def craw(url):
    # Request the JSON listing and pull every movie link out of it with a regular expression
    ua = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"}
    rqg = requests.get(url, headers=ua)
    html = rqg.content.decode('utf-8')
    pat1 = '"url":"(.*?)","playable"'
    news1 = re.compile(pat1, re.S).findall(html)
    list1 = []
    for i in news1:
        a = i.replace('\\', '')  # strip the escaping backslashes from the \/ sequences
        list1.append(a)
    return list1
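For reference, the regular expression and the replace('\\', '') call can be checked against a single hand-written fragment shaped like the JSON the listing endpoint returns. The fragment and the subject id below are made up for illustration; the real response escapes slashes as \/, which is why the backslashes have to be stripped:

import re

# Hypothetical fragment in the same shape as one entry of the JSON listing
sample = '"url":"https:\\/\\/movie.douban.com\\/subject\\/123456\\/","playable":true'
links = re.compile('"url":"(.*?)","playable"', re.S).findall(sample)
print([link.replace('\\', '') for link in links])
# prints: ['https://movie.douban.com/subject/123456/']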
5. After obtaining the movie links, visit each movie's detail page through its link, scrape the movie's detailed information with the same approach used for static pages, and save it to a MySQL database.
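The INSERT statement in the program below expects a database named newdb that already contains a movie table with the columns name, score, five, four, three, too, one and information. The original text does not show that schema, so the snippet here is only a sketch with assumed column types (plain text fields, since the script inserts the scraped strings unchanged):

import pymysql

# Assumed schema -- run once before the crawler; the database newdb must already exist
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='', db='newdb', charset='utf8')
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS movie (
        name VARCHAR(255),
        score VARCHAR(32),
        five VARCHAR(32),
        four VARCHAR(32),
        three VARCHAR(32),
        too VARCHAR(32),
        one VARCHAR(32),
        information TEXT
    ) DEFAULT CHARSET=utf8
''')
conn.commit()
conn.close()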
import requests
import re
from lxml import etree
import pymysql
def craw(url):
    # Request the JSON listing and pull every movie link out of it with a regular expression
    ua = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"}
    rqg = requests.get(url, headers=ua)
    html = rqg.content.decode('utf-8')
    pat1 = '"url":"(.*?)","playable"'
    news1 = re.compile(pat1, re.S).findall(html)
    list1 = []
    for i in news1:
        a = i.replace('\\', '')  # strip the escaping backslashes from the \/ sequences
        list1.append(a)
    return list1
def readurl(url_list):
    # Request headers reused for every movie detail page
    ua = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"}
    for url in url_list:
        rqg = requests.get(url, headers=ua)
        html = rqg.content.decode('utf-8')
        print('url', url)
        # Parse the static detail page and extract the title, rating and star distribution
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
        name = tree.xpath('//div[@id="content"]/h1/span[1]/text()')
        score = tree.xpath('//div[@id="interest_sectl"]/div[1]/div[2]/strong/text()')
        five = tree.xpath('//div[@id="interest_sectl"]/div[1]/div[3]/div[1]/span[2]/text()')
        four = tree.xpath('//div[@id="interest_sectl"]/div[1]/div[3]/div[2]/span[2]/text()')
        three = tree.xpath('//div[@id="interest_sectl"]/div[1]/div[3]/div[3]/span[2]/text()')
        too = tree.xpath('//div[@id="interest_sectl"]/div[1]/div[3]/div[4]/span[2]/text()')  # 2-star share
        one = tree.xpath('//div[@id="interest_sectl"]/div[1]/div[3]/div[5]/span[2]/text()')  # 1-star share
        contentlist = tree.xpath('//div[@id="info"]')
        # Flatten each matched <div id="info"> element into its plain-text content
        y = 0
        for node in contentlist:
            contentlist[y] = node.xpath('string(.)')
            y += 1
        print(contentlist)
        # Open the connection with named parameters
        conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='', db='newdb', charset='utf8',
                               connect_timeout=1000)
        # Create a cursor
        cursor = conn.cursor()
        sql = 'insert into movie (name,score,five,four,three,too,one,information) values(%s,%s,%s,%s,%s,%s,%s,%s)'
        x = 0
        for n in name:
            try:
                a = score[x]
                b = five[x]
                c = four[x]
                d = three[x]
                e = too[x]
                f = one[x]
                g = contentlist[x]
            except IndexError:
                # Fall back to '0' when any field is missing on the page
                a = b = c = d = e = f = g = '0'
            print('Movie title:', n)
            print('Rating:', a)
            print('5 stars:', b, ', 4 stars:', c, ', 3 stars:', d, ', 2 stars:', e, ', 1 star:', f)
            print('Movie info:', g)
            x += 1
            print('\n')
            try:
                cursor.execute(sql, (n, a, b, c, d, e, f, g))
                conn.commit()
            except Exception:
                print('Failed to insert data')
            else:
                print('Inserted data successfully')
            # print('Release date:', date1[x], date2[x])
        conn.close()
count = int(input('Enter the number of movies to crawl: '))
url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=' + str(count) + '&page_start=0'
b=craw(url)
readurl(b)