I wrote a separate .py file to wrap the database operations, so the Maoyan scraping script can just import and call it. It needs the pymysql library; if you don't have it yet, install it first. Here is the code:

#coding=utf-8
import pymysql

class mysqlConn:

    def get_conn(self, dbname):
        """Connect to the database whose name you pass in."""
        self.conn = pymysql.connect(
            host="127.0.0.1",
            user="root",
            password="your_password",
            db=dbname,  # the database you want to connect to
            charset="utf8"
        )
        self.cur = self.conn.cursor()

    def exe_sql(self, sql):
        """Execute a SQL statement that returns no result, e.g. insert/delete/update."""
        self.cur.execute(sql)
        self.conn.commit()
        print("Transaction committed")

    def select_sql(self, sql):
        """Execute a select statement and return all rows."""
        self.cur.execute(sql)
        return self.cur.fetchall()

    def close_conn(self):
        if self.cur:
            self.cur.close()
        if self.conn:
            self.conn.close()

if __name__ == "__main__":
    # try it on an existing table to make sure it works
    connection = mysqlConn()
    connection.get_conn("school")  # connect to the 'school' database
    sql = '''insert into student2 (name, nickname) values ("赵六", "六娃")'''
    connection.exe_sql(sql)
    connection.close_conn()
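Assuming the file above is saved as mysql_api.py (the module name the final scraper imports later), reading data back is just as easy. select_sql isn't exercised above, so here is a minimal sketch using the same student2 table from the demo:

#coding=utf-8
from mysql_api import mysqlConn

connection = mysqlConn()
connection.get_conn("school")
# select_sql returns every row as a tuple of tuples
rows = connection.select_sql("select name, nickname from student2")
for name, nickname in rows:
    print(name, nickname)
connection.close_conn()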

The two pieces of data we need from the movie list page are settled; everything else has to be scraped from each movie's detail page. Let's first write a bit of list-page scraping code and see how it does:

#coding=utf-8
import requests
from lxml import etree
from fake_useragent import UserAgent

This uses a library that fakes a random request header. Scraping too many times in one day kept triggering the verification page, and randomized User-Agents help a lot with that. If you're interested, run 'pip install fake_useragent'; if not, an ordinary hard-coded header works fine.
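If you'd rather skip fake_useragent, a hand-written header like the following does the job; the UA string here is just a typical desktop Chrome value, nothing special about it:

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/90.0.4430.93 Safari/537.36"
}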

headers = {
    "User-Agent": UserAgent().random
}
url = "https://maoyan.com/films"  # assumed: the link was stripped from the post; this is Maoyan's film list page
print("url: " + url)
resp = requests.get(url=url, headers=headers)
tree = etree.HTML(resp.text)
# full image addresses, can be opened directly
img_ar = tree.xpath('//dl/dd//img[2]/@src')
# only the tail half of each address; the site root still has to be prepended
urls_ar = tree.xpath('//dd/div[@class="movie-item film-channel"]/a/@href')
# print them to check whether the data actually came through
print(img_ar)
print(urls_ar)

You can experiment with other XPath expressions here; I won't go over more variants.
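Since the detail links come back as path fragments, they have to be joined onto the site root before they can be requested. A minimal sketch, assuming the root is Maoyan's domain (the exact prefix was stripped from the original post):

base = "https://maoyan.com"  # assumed site root
full_urls = [base + u for u in urls_ar]
print(full_urls[:3])  # spot-check the first few joined URLs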

The code for scraping a detail page:

headers = {
    "User-Agent": UserAgent().random
}
url = ''  # fill in one movie's detail URL here, e.g. a joined link from above
print("url: " + url)
resp = requests.get(url=url, headers=headers)
tree = etree.HTML(resp.content.decode("utf-8"))
name = str(tree.xpath('string(//h1)'))  # cast to the built-in str type, just to be safe
print("Saving movie <{}>......".format(name))
actors_ar = tree.xpath('//div//li/div[@class="info"]/a/text()')  # list of actors
types = tree.xpath('string(//li[@class="ellipsis"])').replace("\n", "").replace(" ", "")  # a plain string
intro = str(tree.xpath('string(//span[@class="dra"])'))
actors = '|'.join(actors_ar).replace("\n", "").replace(" ", "")  # join the actor list into one string

You can print the results yourself; as long as you don't hit the verification page, the data should come through.
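For example, a quick look at what was extracted (same variable names as above):

print("name:  ", name)
print("types: ", types)
print("actors:", actors)
print("intro: ", intro[:60])  # the first 60 characters of the synopsis are enough for a spot check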

Finally, the complete Maoyan scraping code, wrapped into a class to build good coding habits:

import requests
from lxml import etree
from mysql_api import mysqlConn
from fake_useragent import UserAgent
from pymysql import err

class maoYan_spider:

    headers = {
        "User-Agent": UserAgent().random
    }

    def get_urls(self, url):
        """Return every poster address and detail-page URL captured from one list page."""
        print("url: " + url)
        resp = requests.get(url=url, headers=self.headers)
        tree = etree.HTML(resp.text)
        # full image addresses, can be opened directly
        img_ar = tree.xpath('//dl/dd//img[2]/@data-src')
        # only the tail half of each address; the site root still has to be prepended
        urls_ar = tree.xpath('//dd/div[@class="movie-item film-channel"]/a/@href')
        # Python makes returning several values easy (they come back as one tuple)
        return img_ar, urls_ar

    def save_data(self, img_src, url):
        """Write one movie's details into the database."""
        #print("url: " + url)
        resp = requests.get(url=url, headers=self.headers)
        tree = etree.HTML(resp.content.decode("utf-8"))
        name = str(tree.xpath('string(//h1)'))
        print("Saving movie <{}>......".format(name))
        if name == "":
            print("Hit the verification page; consider an XM proxy. Stopping")
            return False
        actors_ar = tree.xpath('//li/div[@class="info"]/a/text()')  # list of actors
        types = tree.xpath('string(//li[@class="ellipsis"])').replace("\n", "").replace(" ", "")  # a plain string
        intro = str(tree.xpath('string(//span[@class="dra"])'))
        actors = '|'.join(actors_ar).replace("\n", "").replace(" ", "")  # join the actor list into one string
        # note: building SQL via % formatting breaks if the text contains quotes; see the parameterized version at the end of the post
        sql = 'insert into maoyan (m_name, m_type, m_src, m_link, m_intro, m_actors) values ("%s","%s","%s","%s","%s","%s")' % (name, types, img_src, url, intro, actors)
        try:
            self.connect.exe_sql(sql)
        except err.ProgrammingError:
            print("This record has an encoding problem; skipping it")
        return True

    def run(self):
        self.connect = mysqlConn()
        self.connect.get_conn("movies")
        tag = True
        # scrape the first two pages of movies
        for i in range(2):
            # assumed: the list URL was stripped from the post; each Maoyan list page holds 30 movies
            main_url = "https://maoyan.com/films?offset={}".format(30 * i)
            imgs, urls = self.get_urls(main_url)
            if len(imgs) == 0:
                print("Hit the verification page")
                print("Trying again...")
                imgs, urls = self.get_urls(main_url)
            for img, url in zip(imgs, urls):
                img = img.split('@')[0]  # drop the size suffix after '@' to keep the full poster address
                url = 'https://maoyan.com' + url  # assumed site root; the original prefix was stripped
                tag = self.save_data(img, url)
                while not tag:
                    tag = True
                    print("Trying again...")
                    tag = self.save_data(img, url)
        self.connect.close_conn()

if __name__ == "__main__":
    conn1 = mysqlConn()
    conn1.get_conn("movies")
    # create the table (only needed on the first run)
    sql = """create table maoyan(
        m_id int primary key auto_increment,
        m_name varchar(30) not null,
        m_type varchar(20) null,
        m_src varchar(100) not null,
        m_link varchar(100) not null,
        m_intro text null,
        m_actors text null
    )default charset = utf8"""
    conn1.exe_sql(sql)
    conn1.close_conn()

    spider = maoYan_spider()
    spider.run()
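One last note: save_data builds its insert statement with % string formatting, which breaks as soon as a synopsis contains a double quote, and it is unsafe in general. pymysql can escape values itself if you pass them separately. Here is a sketch of how exe_sql could be extended to support that; the extra args parameter is my addition, not part of the original class:

    def exe_sql(self, sql, args=None):
        """Execute a statement; values passed via args are escaped by pymysql."""
        self.cur.execute(sql, args)
        self.conn.commit()
        print("Transaction committed")

The insert in save_data would then become:

sql = ('insert into maoyan (m_name, m_type, m_src, m_link, m_intro, m_actors) '
       'values (%s, %s, %s, %s, %s, %s)')
self.connect.exe_sql(sql, (name, types, img_src, url, intro, actors))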