python第三天(二) Lxml爬虫

python第三天(二) Lxml爬虫_第1张图片

在猫眼电影中,由榜单页面链接到每部电影的详情页面,并同时爬取这两个页面的信息:

import requests
from lxml import etree
import csv


# Request headers sent with every HTTP request made by this script; the
# User-Agent value is a desktop Chrome browser string.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/68.0.3440.106 Safari/537.36'
)
headers = {'User-Agent': _USER_AGENT}

def get_url(url):
    """Scrape the board page at ``url`` and follow every entry's detail link.

    For each ``<dd>`` under the ``board-wrapper`` list this reads the entry's
    name, detail-page link, ``star`` text, ``release_time`` text and score,
    then passes them to ``get_info`` so the list-page fields and the
    detail-page fields of the same entry are written out together.

    Args:
        url: Address of the board (list) page to scrape.

    Raises:
        requests.HTTPError: If the board page answers with a 4xx/5xx status.
        IndexError: If an entry lacks one of the expected nodes.
    """
    # Without a timeout a stalled connection would hang the crawl forever.
    res = requests.get(url, headers=headers, timeout=10)
    # Fail with a clear HTTP error instead of parsing an error page.
    res.raise_for_status()
    html = etree.HTML(res.text)
    # One <dd> per entry on the board page.
    infos = html.xpath('//dl[@class="board-wrapper"]/dd')
    for info in infos:
        name = info.xpath('div/div/div[1]/p[1]/a/text()')[0]
        # The href is site-relative, so prefix it with the site root.
        info_url = 'http://maoyan.com' + info.xpath('div/div/div[1]/p[1]/a/@href')[0]
        star = info.xpath('div/div/div[1]/p[2]/text()')[0].strip()
        release_time = info.xpath('div/div/div[1]/p[3]/text()')[0].strip()
        # The score is split across two <i> elements; concatenate their text.
        score_1 = info.xpath('div/div/div[2]/p/i[1]/text()')[0]
        score_2 = info.xpath('div/div/div[2]/p/i[2]/text()')[0]
        score = score_1 + score_2
        get_info(info_url, name, star, release_time, score)

def get_info(url,name,star,time,score):
    """Fetch a detail page and write one complete row to the CSV output.

    Args:
        url: Address of the detail page.
        name: Passed through unchanged into the output row.
        star: Passed through unchanged into the output row.
        time: Passed through unchanged into the output row.
        score: Passed through unchanged into the output row.

    The row is printed and also written through the module-level ``writer``.

    Raises:
        requests.HTTPError: If the detail page answers with a 4xx/5xx status.
        IndexError: If the expected ``<li>`` nodes are missing, or the second
            one contains no ``/`` separator.
    """
    # Without a timeout a stalled connection would hang the crawl forever.
    res = requests.get(url, headers=headers, timeout=10)
    # Fail with a clear HTTP error instead of parsing an error page.
    res.raise_for_status()
    html = etree.HTML(res.text)
    # Both fields live in sibling <li> items under the same absolute path.
    li_base = '/html/body/div[3]/div/div[2]/div[1]/ul/'
    style = html.xpath(li_base + 'li[1]/text()')[0]
    # Keep only the part after the first '/' in the second <li>'s text.
    long_time = html.xpath(li_base + 'li[2]/text()')[0].split('/')[1].strip()
    row = [name, star, time, score, style, long_time]
    print(*row)
    writer.writerow(row)


# Open the output CSV in a `with` block so the file is flushed and closed even
# if the crawl raises part-way through. `writer` stays a module-level name
# (a `with` block does not create a new scope) because get_info() reads it as
# a global.
with open('maoyan_2.csv','w',encoding='utf-8',newline='') as fp:
    writer = csv.writer(fp)
    # Header row; the column order matches the rows written by get_info().
    writer.writerow(['name','star','time','score','style','long_time'])
    url = 'http://maoyan.com/board/4'
    get_url(url)
python第三天(二) Lxml爬虫_第2张图片

由一个页面链接到相对应的详情页面,并且保证相对应的数据信息顺序不发生错乱。做法是通过函数参数传递:把列表页已爬取的字段作为参数传给爬取详情页的函数,由它把同一条记录的全部字段一起写入。

链家长沙二手房网站爬取:

python第三天(二) Lxml爬虫_第3张图片
import requests
from lxml import etree

# Scrape the first page of the Lianjia second-hand housing listings and print
# one line per listing.
url = 'https://cs.lianjia.com/ershoufang/pg1/'

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}

# Without a timeout a stalled connection would hang the script forever.
res = requests.get(url, headers=headers, timeout=10)
# Fail with a clear HTTP error instead of parsing an error page.
res.raise_for_status()
html = etree.HTML(res.text)
# One <li> per listing.
infos = html.xpath('//ul[@class="sellListContent"]/li')
for info in infos:
    title = info.xpath('div[1]/div[1]/a/text()')[0]
    name = info.xpath('div[1]/div[2]/div/a/text()')[0]
    # The remaining attributes arrive as a single ' | '-separated string;
    # the piece at index 0 is not used.
    all_info = info.xpath('div[1]/div[2]/div/text()')[0]
    all_info_1 = all_info.split(' | ')
    room = all_info_1[1]
    area = all_info_1[2]
    orient = all_info_1[3]
    style = all_info_1[4]
    # Exactly six pieces means the last one carries the lift information;
    # otherwise fall back to the literal '无电梯' ("no lift").
    lift = all_info_1[5] if len(all_info_1) == 6 else '无电梯'
    price = info.xpath('div[1]/div[6]/div[1]/span/text()')[0]
    print(title, name, room, area, orient, style, lift, price)

在链家的例子中,我们在原来的基础上添加了 if 语句来判断房屋是否有电梯:房屋信息按 ' | ' 分割后如果恰好有 6 个字段,就取最后一个字段作为电梯信息,否则记为“无电梯”。


python第三天(二) Lxml爬虫_第4张图片

你可能感兴趣的:(python第三天(二) Lxml爬虫)