Python 网络爬虫与数据库

这是一个简单的爬取豆瓣电影TOP250的代码,爬取了每一条电影的18个维度的数据,并且将它们存储在本地的MySQL数据库中.

详细代码如下.

requests :请求网页,获取网页数据

lxml:使用xpath语法快速解析网页数据

# -*- coding: utf-8 -*-
"""
Created on Tue Jan 22 20:55:02 2019

@author: tide1
"""


import requests
from lxml import etree
import re
import time 
import pymysql
import numpy as np
'''
1.数据库操作
'''
# Connect to the local MySQL instance that will receive the scraped records.
# NOTE(review): credentials are hard-coded — move to config/env for real use.
conn=pymysql.connect(host='localhost',user='dns',
                     passwd='123456',db='mydb',port=3306,charset='utf8') #
cursor=conn.cursor()  # cursor shared by all subsequent SQL statements


# Recreate the table on every run so repeated scrapes start from a clean slate.
cursor.execute("DROP TABLE IF EXISTS douban_movie")
# 18 columns, one per scraped field; everything is stored as TEXT for simplicity
# (even score/counts — parsing to numbers is left to downstream consumers).
sql = """CREATE TABLE douban_movie (
         movie_name  TEXT,
         director TEXT,
         writers TEXT,  
         actors TEXT,
         style TEXT,
         country TEXT,
         language TEXT,
         release_times TEXT,
         time TEXT,
         anthor_name TEXT,
         score TEXT,
         num_comments TEXT,
         five_star TEXT,
         four_star TEXT,
         three_star TEXT,
         two_star TEXT,
         one_star TEXT,
         better TEXT)default charset = utf8;"""
cursor.execute(sql)
# Data persistence helper
def toSQL(a):
    """Insert one movie record into douban_movie.

    a: a sequence of (at least) 18 field values, in the column order declared
       in the CREATE TABLE above.  Uses driver-side %s placeholders so values
       are properly escaped.  Relies on the module-level `cursor`; the caller
       is responsible for committing via `conn`.
    """
    # Build the 18 placeholders instead of hand-enumerating a[0]..a[17].
    placeholders = ','.join(['%s'] * 18)
    cursor.execute('INSERT INTO douban_movie(movie_name,'
                   'director,writers,actors,style,country,language,release_times,'
                   'time,anthor_name,score,num_comments,five_star,four_star,'
                   'three_star,two_star,one_star,better)'
                   ' VALUES(' + placeholders + ')',
                   tuple(a[:18]))

'''
2.爬取操作
'''

# Copied User-Agent: masquerade as a desktop Chrome browser so Douban
# serves the normal HTML pages instead of rejecting the scraper.
headers = {   "User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US)\
          AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16"}

def get_movie_url(url1):  # fetch one TOP250 list page and scrape every movie on it
    """Download the list page at `url1`, extract each movie's detail-page
    link and hand it to get_movie_info()."""
    page = requests.get(url1, headers=headers)
    tree = etree.HTML(page.text)
    # Each entry's title block (<div class="hd">) carries the detail-page link.
    for detail_url in tree.xpath('//div[@class="hd"]/a/@href'):
        get_movie_info(detail_url)
    
def get_movie_info(url):  # scrape the detail fields and write one record
    """Scrape 18 fields from one movie detail page and store them via toSQL().

    A page whose layout is missing an expected element raises IndexError on
    one of the [0] lookups; such movies are skipped silently (best-effort),
    matching the original behaviour.
    """
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)  # parsed DOM for the xpath lookups
    a = list()
    try:
        # 1. movie title
        name = selector.xpath('//*[@id="content"]/h1/span[1]/text()')[0]
        a.append(str(name))
        # 2. director
        director = selector.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')[0]
        a.append(str(director))
        # 3. writers — string(.) flattens all text inside the span
        writers = selector.xpath('//*[@id="info"]/span[2]/span[2]')[0]
        a.append(str(writers.xpath('string(.)')))
        # 4. main actors
        actors = selector.xpath('//*[@id="info"]/span[3]/span[2]')[0]
        a.append(str(actors.xpath('string(.)')))
        # NOTE(review): the published source lost its HTML markup inside the
        # regex literals below; the patterns are reconstructed from Douban's
        # detail-page structure — confirm against a live page.
        # 5. genres, joined with '/'
        styles = re.findall('<span property="v:genre">(.*?)</span>',
                            html.text, re.S)
        a.append(str('/'.join(styles)))
        # 6. country/region of production
        country = re.findall('制片国家/地区:</span>(.*?)<br', html.text, re.S)
        a.append(str(country))
        # 7. language(s)
        language = re.findall('语言:</span>(.*?)<br', html.text, re.S)
        a.append(str(language))
        # 8. release dates
        release_times = re.findall(
            '<span property="v:initialReleaseDate" content="(.*?)"',
            html.text, re.S)
        a.append(str(release_times))
        # 9. runtime
        movie_time = re.findall('<span property="v:runtime" content="(.*?)"',
                                html.text, re.S)
        a.append(str(movie_time))
        # 10. alternative titles (original had `anthor_names= country=...`,
        # which clobbered `country` — fixed here)
        anthor_names = re.findall('又名:</span>(.*?)<br', html.text, re.S)
        a.append(str(anthor_names))
        # 11. rating score (kept as float, as in the original)
        score = selector.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()')[0]
        a.append(float(score))
        # 12. number of ratings
        num_comments = selector.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')[0]
        a.append(str(num_comments))
        # 13-17. star-rating percentages, five stars down to one star
        for row in range(1, 6):
            star = selector.xpath(
                '//*[@id="interest_sectl"]/div[1]/div[3]/div[%d]/span[2]/text()'
                % row)[0]
            a.append(str(star))
        # 18. the "better than x% of ..." comparison block
        better_than = selector.xpath('//*[@id="interest_sectl"]/div[2]')[0]
        a.append(str(better_than.xpath('string(.)')))
        toSQL(a)
    except IndexError:
        # Element missing on this page: skip the movie rather than abort the run.
        pass


if __name__ == '__main__':
    # 10 list pages of 25 movies each: start=0,25,...,225
    urls = ['https://movie.douban.com/top250?start={}'.format(str(i))
            for i in range(0, 250, 25)]
    count = 0
    for url in urls:
        get_movie_url(url)
        # Randomized pause (~15 s on average) to stay polite and avoid blocking.
        time.sleep(10 + np.random.normal(5))
        count += 1
        print(count)
    conn.commit()  # persist all inserts at the end of the run

 

你可能感兴趣的:(Python)