三种方法抓取猫眼电影top100信息

分别使用 BeautifulSoup、XPath 和正则表达式三种方法提取猫眼电影 Top100 的信息。程序比较简单,这里不再逐行解释,直接看代码。

# coding:utf-8
import requests
import re
from lxml import html
from bs4 import BeautifulSoup

# Base address of the board page to scrape. The trailing '?' is part of the
# literal; the driver loop supplies the page through the `offset` query
# parameter it hands to getResponse().
url = 'http://maoyan.com/board/4?'

def getResponse(url, par=None, timeout=10):
    """Fetch *url* with a GET request and return the response as UTF-8 text.

    Args:
        url: Address to request.
        par: Optional mapping of query-string parameters; forwarded to
            ``requests.get`` as ``params``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The ``requests`` response object with ``encoding`` forced to
        ``'utf-8'``.

    Raises:
        SystemExit: If the request fails or the server answers with an HTTP
            error status. The exit message is unchanged from the original.
    """
    try:
        response = requests.get(url, params=par, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures end the program; KeyboardInterrupt and
        # genuine programming errors are no longer swallowed by a bare except.
        raise SystemExit('url 解析失败') from None
    response.encoding = 'utf-8'
    return response

def bs4_info(response):
    """Extract names, cast lines, release times and scores with BeautifulSoup.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        A tuple ``(names, stars, times, scores)`` of four lists, one entry
        per matching ``<p>`` element found in the page.
    """
    document = BeautifulSoup(response.text, 'html.parser')

    def paragraphs(css_class):
        # A bare string given as ``attrs`` is matched against the CSS class.
        return document.find_all(name='p', attrs=css_class)

    names = [tag.a.string for tag in paragraphs('name')]
    stars = [tag.string.strip() for tag in paragraphs('star')]
    times = [tag.string for tag in paragraphs('releasetime')]

    # The score is split across the first two children of the tag;
    # concatenate their strings.
    scores = []
    for tag in paragraphs('score'):
        children = tag.contents
        scores.append(children[0].string + children[1].string)

    return names, stars, times, scores

def lxml_info(response):
    """Extract names, cast lines, release times and scores with XPath.

    Args:
        response: Object whose ``content`` attribute holds the raw page bytes.

    Returns:
        A tuple ``(names, stars, times, scores)`` of four lists.
    """
    # Original author's note: decode explicitly, otherwise Chinese text is
    # displayed incorrectly.
    page_source = response.content.decode('utf-8')
    tree = html.fromstring(page_source)

    # Original author's note: ``text`` must be written with parentheses.
    names = tree.xpath("//p[@class='name']/a/text()")
    stars = [raw.strip() for raw in tree.xpath("//p[@class='star']/text()")]
    times = list(tree.xpath("//p[@class='releasetime']/text()"))

    # Integer and fractional parts are separate <i> elements; pair them up
    # positionally and join each pair into one score string.
    integer_parts = tree.xpath("//i[@class='integer']/text()")
    fraction_parts = tree.xpath("//i[@class='fraction']/text()")
    scores = [whole + frac for whole, frac in zip(integer_parts, fraction_parts)]

    return names, stars, times, scores

# NOTE(review): the tag markup in these patterns was rebuilt from the element
# and class names the bs4/XPath extractors in this file query (p.name > a,
# p.star, p.releasetime, p.score > i.integer + i.fraction) -- confirm against
# the live page. In each pattern the parenthesised group is what is extracted.
_NAME_PATTERN = re.compile(r'<p class="name"><a[^>]*>(.*?)</a></p>')
# Captures exactly the 10 characters following the label (presumably a
# YYYY-MM-DD date). The label colon may be ASCII or full-width.
_TIME_PATTERN = re.compile(r'<p class="releasetime">上映时间[::](.{10}).*?</p>')
# re.S lets '.' cross line breaks inside the star paragraph.
_STAR_PATTERN = re.compile(r'<p class="star">.*?主演[::](.*?)</p>', re.S)
_SCORE_PATTERN = re.compile(
    r'<p class="score"><i class="integer">(.*?)</i>'
    r'<i class="fraction">(\d)</i></p>'
)


def re_info(response):
    """Extract names, cast lines, release times and scores with regexes.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        A tuple ``(names, stars, times, scores)`` of four lists of strings.
        Cast lines are stripped of surrounding whitespace; each score is the
        integer part concatenated with the fraction digit.
    """
    text = response.text

    names = _NAME_PATTERN.findall(text)
    times = _TIME_PATTERN.findall(text)
    stars = [x.strip() for x in _STAR_PATTERN.findall(text)]
    scores = [score[0] + score[1] for score in _SCORE_PATTERN.findall(text)]

    return names, stars, times, scores


if __name__ == '__main__':
    # Walk the 10 pages (offset 0, 10, ..., 90) and accumulate the results.
    names, stars, times, scores = [], [], [], []
    for i in range(10):
        response = getResponse(url, par={'offset': str(10 * i)})
        name, star, time, score = re_info(response)
        names += name
        stars += star
        times += time
        scores += score

你可能感兴趣的:(tensorflow入门,Python基础)