这是一个简单的爬取豆瓣电影TOP250的代码，爬取了每一条电影的18个维度的数据，并且将它们存储在本地的MySQL数据库中。
详细代码如下.
requests :请求网页,获取网页数据
lxml:使用xpath语法快速解析网页数据
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 22 20:55:02 2019
@author: tide1
"""
import requests
from lxml import etree
import re
import time
import pymysql
import numpy as np
'''
1.数据库操作 (database setup)
'''
# Connect to the local MySQL instance.
# NOTE(review): credentials are hard-coded; move to env vars for real use.
# charset fix: MySQL's "utf8" is a 3-byte subset that rejects
# supplementary-plane characters (e.g. emoji in movie aliases);
# "utf8mb4" is full UTF-8.
conn = pymysql.connect(host='localhost', user='dns',
                       passwd='123456', db='mydb', port=3306,
                       charset='utf8mb4')
cursor = conn.cursor()  # single cursor shared by the whole script
# Recreate the table from scratch on every run.
cursor.execute("DROP TABLE IF EXISTS douban_movie")
# One TEXT column per scraped dimension (18 total).
# NOTE(review): "anthor_name" is a typo for "another_name"/aka, but it is
# kept because toSQL() inserts into that column name.
sql = """CREATE TABLE douban_movie (
    movie_name TEXT,
    director TEXT,
    writers TEXT,
    actors TEXT,
    style TEXT,
    country TEXT,
    language TEXT,
    release_times TEXT,
    time TEXT,
    anthor_name TEXT,
    score TEXT,
    num_comments TEXT,
    five_star TEXT,
    four_star TEXT,
    three_star TEXT,
    two_star TEXT,
    one_star TEXT,
    better TEXT) DEFAULT CHARSET = utf8mb4;"""
cursor.execute(sql)
#数据存储
def toSQL(a):
    """Insert one 18-element movie record into the douban_movie table.

    ``a`` is a sequence holding the 18 scraped fields in column order;
    the commit is left to the caller.
    """
    columns = ('movie_name', 'director', 'writers', 'actors', 'style',
               'country', 'language', 'release_times', 'time',
               'anthor_name', 'score', 'num_comments', 'five_star',
               'four_star', 'three_star', 'two_star', 'one_star', 'better')
    placeholders = ','.join(['%s'] * len(columns))
    query = 'INSERT INTO douban_movie({}) VALUES({})'.format(
        ','.join(columns), placeholders)
    # Parameterized insert: pymysql escapes each value safely.
    cursor.execute(query, tuple(a[:len(columns)]))
'''
2.爬取操作
'''
# Spoof a desktop Chrome user agent so douban serves the normal page.
# NOTE(review): the original backslash line-continuation inside the string
# dropped the space before "AppleWebKit"; reproduced verbatim here so the
# header bytes are unchanged.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US)"
        "AppleWebKit/534.16 (KHTML, like Gecko) "
        "Chrome/10.0.648.133 Safari/534.16"
    )
}
def get_movie_url(url1):
    """Fetch one TOP250 list page and scrape every movie linked from it.

    ``url1`` is a list-page URL such as
    https://movie.douban.com/top250?start=0
    """
    response = requests.get(url1, headers=headers)
    page = etree.HTML(response.text)
    # Each entry's detail-page link lives under <div class="hd"><a href=...>.
    for detail_url in page.xpath('//div[@class="hd"]/a/@href'):
        get_movie_info(detail_url)
def get_movie_info(url):  # 获取具体信息,并且写入信息
    """Scrape 18 fields from one movie detail page and store them via toSQL.

    Fields, in insert order: title, director, writers, actors, genres,
    country, language, release dates, runtime, aliases, score, number of
    ratings, 5/4/3/2/1-star percentages, and the "better than" comparison.

    Fixes vs. the original: the regex literals were syntactically broken
    (strings split across lines, HTML-tag anchors stripped to empty
    patterns) and ``anthor_names= country=`` clobbered the country value.
    The patterns below are reconstructed from douban's detail-page markup
    — TODO confirm against a live page, since douban's HTML may change.
    """
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)  # parse the page once for XPath use
    a = list()
    try:
        # 1. title
        name = selector.xpath('//*[@id="content"]/h1/span[1]/text()')[0]
        a.append(str(name))
        # 2. director
        director = selector.xpath(
            '//*[@id="info"]/span[1]/span[2]/a/text()')[0]
        a.append(str(director))
        # 3. writers — concatenate all text under the attrs span
        writers = selector.xpath('//*[@id="info"]/span[2]/span[2]')[0]
        a.append(str(writers.xpath('string(.)')))
        # 4. actors
        actors = selector.xpath('//*[@id="info"]/span[3]/span[2]')[0]
        a.append(str(actors.xpath('string(.)')))
        # 5. genres: one <span property="v:genre"> per genre
        styles = re.findall(r'<span property="v:genre">(.*?)</span>',
                            html.text, re.S)
        a.append('/'.join(styles))
        # 6. country: plain text between the label span and the <br>
        country = re.findall(r'制片国家/地区:</span>(.*?)<br', html.text, re.S)
        a.append('/'.join(part.strip() for part in country))
        # 7. language
        language = re.findall(r'语言:</span>(.*?)<br', html.text, re.S)
        a.append('/'.join(part.strip() for part in language))
        # 8. release dates (may be several, one per region)
        release_times = re.findall(
            r'<span property="v:initialReleaseDate" content="(.*?)"',
            html.text, re.S)
        a.append('/'.join(release_times))
        # 9. runtime
        movie_time = re.findall(
            r'<span property="v:runtime" content="(.*?)"', html.text, re.S)
        a.append('/'.join(movie_time))
        # 10. aliases (又名) — fixed: no longer overwrites `country`
        anthor_names = re.findall(r'又名:</span>(.*?)<br', html.text, re.S)
        a.append('/'.join(part.strip() for part in anthor_names))
        # 11. score
        score = selector.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]
        a.append(float(score))
        # 12. number of ratings
        num_comments = selector.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')[0]
        a.append(str(num_comments))
        # 13-17. star-rating percentage rows, 5 stars down to 1
        for row in range(1, 6):
            star = selector.xpath(
                '//*[@id="interest_sectl"]/div[1]/div[3]/div[%d]/span[2]/text()'
                % row)[0]
            a.append(str(star))
        # 18. "better than x% of <genre>" comparison block
        better_than = selector.xpath('//*[@id="interest_sectl"]/div[2]')[0]
        a.append(str(better_than.xpath('string(.)')))
        toSQL(a)
    except IndexError:
        # A field is missing on this page (e.g. no writers credit);
        # skip the movie but leave a trace instead of failing silently.
        print('skipped (missing field): ' + url)
if __name__ == '__main__':
    # The ten TOP250 list pages: start=0, 25, ..., 225.
    urls = ['https://movie.douban.com/top250?start={}'.format(i)
            for i in range(0, 250, 25)]
    try:
        for count, url in enumerate(urls, start=1):
            get_movie_url(url)
            # Commit after every page so a crash midway keeps the rows
            # already scraped (the original committed once at the very end).
            conn.commit()
            # Randomized polite delay (~15 s) between list pages.
            time.sleep(10 + np.random.normal(5))
            print(count)
    finally:
        # Always release the DB resources, even on an exception.
        cursor.close()
        conn.close()