Python crawler: scraping Tujia.com with Selenium + ChromeDriver

Article link: https://blog.csdn.net/Yeoman92/article/details/83105318

The code is fairly rough and not very polished; please bear with it. Feedback and discussion are welcome!
For the complete code and documentation, see https://gitee.com/yeoman92/tujia_craw
Overview
A plain page-fetch cannot retrieve data from the target site (Tujia, https://www.tujia.com) because the content is rendered dynamically; Selenium + ChromeDriver is used instead to obtain the fully loaded page.
0 Script execution order

0.1 First run craw_url.py to collect the URLs of all house detail pages
0.2 Then run slice_url.py to split the URLs into equal parts for later multi-threaded crawling
0.3 Finally run craw.py to scrape the detailed data of each house (a small runner sketch follows this list)
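
For convenience, the three stages can also be chained with a small runner script. This is only a sketch of my own (the run_all.py name and the subprocess calls are not part of the original project); each stage must finish before the next one starts:

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# run_all.py -- hypothetical helper, not part of the original repo:
# runs the three crawl stages in order and aborts if one of them fails.
import subprocess
import sys

for script in ('craw_url.py', 'slice_url.py', 'craw.py'):
    print('running ' + script)
    result = subprocess.run([sys.executable, script])
    if result.returncode != 0:
        sys.exit('stage failed: ' + script)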

1 Notes

1.1 The site's data is loaded dynamically, hence Selenium + ChromeDriver to obtain the page data
1.2 chromedriver.exe is bundled with the project; Google Chrome must be installed (if it fails to run, the browser and chromedriver.exe versions probably don't match; the matching Chrome version is 69)
1.3 After the driver simulates an action, wait 1-2 s before reading data from the page
1.4 The site has anti-crawling measures; sleeping 6 s after each page action is enough (see the wait helper sketch after this list)
1.5 chrome_options.add_argument("headless") makes Chrome run without opening a browser window
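
To make notes 1.3 and 1.4 concrete, here is a minimal sketch of both waiting styles; the safe_click helper is my own illustration, not code from this project:

# Hypothetical helper illustrating notes 1.3/1.4: wait for an element
# explicitly, then sleep a fixed interval to stay under the anti-crawl limit.
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def safe_click(driver, xpath, pause=6):
    # explicit wait: returns as soon as the element is clickable (at most 10 s)
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    element.click()
    # fixed sleep after the page action (note 1.4 suggests about 6 s)
    time.sleep(pause)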

2 What is scraped

2.1 Tujia listing page: https://www.tujia.com/unitlist?cityId=10
2.2 The scraped fields and their descriptions are shown in the screenshots

Screenshots

(screenshot images omitted; see the original post for the field list)
Code
1 craw_url.py (collect the URLs of all house detail pages)

#! /usr/bin/env python
# -*- coding: utf-8 -*-

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import os

# start the driver
def init_driver(url):
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("headless")  # do not open a browser window

    driver_path = "./bin/chromedriver.exe"
    driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=driver_path)

    driver.get(url)
    # html = driver.page_source
    # print(html.encode("GBK", 'ignore'))

    # time.sleep(3)
    return driver

# delete the file if it already exists
def del_file(file_path):
    if os.path.exists(file_path):
        os.remove(file_path)

# collect the detail-page URLs
def get_url(driver):
    # set up an explicit wait
    wait = WebDriverWait(driver, 10)
    # close the pop-up ad layer
    wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="j-tjdc-close"]'))).click()
    # read the total number of result pages
    total_str = driver.find_elements_by_class_name('pageItem')[-1].get_attribute('page-data')
    total = int(total_str)
    # click through the pages
    click_num = 0
    while click_num < total:
        driver.find_elements_by_class_name('pageItem')[-2].click()
        click_num += 1
        time.sleep(6)  # anti-crawling: sleep after each page action

        # number of result items on this page
        item = driver.find_elements_by_class_name('searchresult-cont')
        item_num = len(item)
        # collect the URL of every item on the page
        for i in range(item_num):
            xpath = '//*[@id="unitList-container"]/div/div[' + str(i+1) + ']/div[2]/div[1]/h3/a'
            url = driver.find_element_by_xpath(xpath).get_attribute('href')
            print(str(i) + '\t' + url)
            # append the URL to a local file
            with open('./data/url/url.txt', 'a', encoding='utf-8') as f:
                f.write(url + '\n')

    close_driver(driver)

def close_driver(driver):
    driver.quit()

if __name__ == '__main__':
    root_url = 'https://www.tujia.com/unitlist?startDate=2018-12-10&endDate=2018-12-11&cityId=10&ssr=off'
    driver = init_driver(root_url)
    del_file('./data/url/url.txt')
    get_url(driver)
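
Note that the chrome_options and executable_path keyword arguments were removed in Selenium 4. If you run this code with a current Selenium release, the driver setup would look roughly like the sketch below (assuming Selenium 4.x; the rest of the script is unchanged):

# Sketch of the same driver setup for Selenium 4.x, where the binary path
# is passed through a Service object instead of executable_path.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def init_driver_v4(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # do not open a browser window
    service = Service('./bin/chromedriver.exe')
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(url)
    return driver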


2 slice_url.py (split the URLs into equal parts for later multi-threaded crawling)

#! /usr/bin/env python
# -*- coding: utf-8 -*-

import math

# there are many URLs and crawling them in one go may fail, so split the work into slices
def main(slice_num):
    # read all the URLs
    with open('./data/url/url.txt', 'r') as f:
        urls = f.readlines()

    urls_num = len(urls)
    step = math.ceil(urls_num / slice_num)

    # write each slice to its own file
    for i in range(slice_num):
        with open('./data/url/url_' + str(i+1) + '.txt', 'w', encoding='utf-8') as f:
            for j in range(step*i, step*(i+1)):
                try:
                    f.write(urls[j])
                except IndexError:
                    # the last slice may be shorter than step
                    break

if __name__ == '__main__':
    # split into 30 equal parts
    main(30)
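
The try/except in the inner loop only guards against reading past the end of the list on the last slice. Python list slices clamp at the list boundary automatically, so the same split can be written without exceptions; a small equivalent sketch:

# Equivalent slicing without try/except: slices past the end of the list
# are simply truncated, so the last chunk just comes out shorter.
import math

def slice_urls(urls, slice_num):
    step = math.ceil(len(urls) / slice_num)
    return [urls[i * step:(i + 1) * step] for i in range(slice_num)]

# slice_urls(['a', 'b', 'c', 'd', 'e'], 2) -> [['a', 'b', 'c'], ['d', 'e']]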


3 craw.py (scrape the detailed data of each house)

#! /usr/bin/env python
# -*- coding: utf-8 -*-

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import os
import time
import threading

# start the driver
def init_driver(url, index):
    global threads
    threads['Thread_' + str(index)] += 1
    print('Thread_' + str(index) + '\t' + str(threads['Thread_' + str(index)]))

    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument("headless")  # do not open a browser window

    driver_path = "./bin/chromedriver.exe"
    driver = webdriver.Chrome(options=chrome_options, executable_path=driver_path)

    try:
        driver.get(url)
    except:
        pass
    # html = driver.page_source
    # print(html.encode("GBK", 'ignore'))

    # time.sleep(2)
    return driver

def close_driver(driver):
    driver.quit()

# delete the file if it already exists
def del_file(file_path):
    if os.path.exists(file_path):
        os.remove(file_path)

# read the local URL file
def read_url(file_path):
    with open(file_path, 'r') as f:
        urls = f.readlines()
    return urls

# scrape the page data
def get_data(driver, file_path, index):
    try:
        # name, price, room tags, payment tags, advantage tags
        name = driver.find_element_by_xpath('//div[@class="house-name"]').text
        price = ''
        try:
            price = driver.find_element_by_xpath('//a[@class="present-price"]').text
        except:
            pass
        # floor area (shown in a tooltip, so hover over the element first)
        area = ''
        try:
            house_type_element = driver.find_element_by_xpath('//*[@id="houseInfo"]/div/div/div[1]/div[3]/ul/li[2]')
            ActionChains(driver).move_to_element(house_type_element).perform()
            area = driver.find_element_by_xpath('//*[@id="houseInfo"]/div/div/div[1]/div[3]/ul/li[2]/div').text
        except:
            pass

        room_tag = ''
        try:
            room_tag = driver.find_element_by_xpath('//ul[@class="room-tag"]').text.replace('\n', ' ')
        except:
            pass
        pay_tag = ''
        try:
            pay_tag = driver.find_element_by_xpath('//ul[@class="pay-tag"]').text.replace('\n', ' ')
        except:
            pass
        advan_tag = ''
        try:
            advan_tag = driver.find_element_by_xpath('//div[@class="hotel-advan-tag"]').text.replace('\n', ' ')
        except:
            pass

        # house rules: collect all rules, then drop the ones marked as not applicable
        house_rules = ''
        try:
            house_rules_all = driver.find_elements_by_xpath('//*[@id="unitcheckinneedtoknow"]/div[2]/div[2]/div[5]/ol/li')
            house_rules_dis = driver.find_elements_by_xpath('//*[@id="unitcheckinneedtoknow"]/div[2]/div[2]/div[5]/ol/li[@class="not"]')
            house_rules = ''
            for item in house_rules_all:
                house_rules += item.text + ' '
            for item in house_rules_dis:
                if item.text:
                    house_rules = house_rules.replace(item.text + ' ', '')
        except:
            pass

        # facilities and services
        facility_service = ''
        # click "show more": scroll down step by step until the link is clickable
        scroll_top = 800
        success = False
        while not success:
            try:
                js = "var q=document.documentElement.scrollTop=" + str(scroll_top)
                driver.execute_script(js)
                driver.find_element_by_xpath('//*[@id="facilityshowmore"]/a').click()
                success = True
            except:
                scroll_top += 100
            time.sleep(1)
        # facility categories and their contents
        try:
            category_item = driver.find_elements_by_xpath('//*[@id="listWrap"]/h5')
            content_item = driver.find_elements_by_xpath('//*[@id="listWrap"]/ul')
            for i, category_ in enumerate(category_item):
                category = category_.text
                content = content_item[i].text.replace('\n', ' ')
                if category:
                    facility_service += category + '('
                    facility_service += content + ')  '
        except:
            pass
        # drop the facilities marked as unavailable
        try:
            facility_dis = driver.find_elements_by_xpath('//*[@id="listWrap"]//li[@class="i-not"]')
            for item in facility_dis:
                if item.text:
                    facility_service = facility_service.replace(item.text + ' ', '')
        except:
            pass

        # landlord info
        # landlord type
        landlord_type = ''
        try:
            landlord_type = driver.find_element_by_xpath('//*[@id="landlordInfo"]/div/div[2]/div/h2/span').text
        except:
            pass
        # landlord verification
        landlord_authentication = ''
        try:
            landlord_authentication = driver.find_element_by_xpath('//*[@id="landlordInfo"]/div/div[2]/div/div[2]').text
        except:
            pass
        # number of the landlord's other listings
        landlord_other_house_num = ''
        try:
            landlord_other_house_num = driver.find_element_by_xpath('//div[@class="landlord-other-house"]/h2/span').text
        except:
            pass

        # review summary: overall score, per-item scores, review count, reviews with photos
        overall_score = ''
        single_score = ''
        comment_sum = ''
        comment_photo_sum = ''
        try:
            overall_score = driver.find_element_by_xpath('//*[@id="overallScore"]').text
            single_score = driver.find_element_by_xpath('//*[@id="comment-summary"]/div[2]/div[1]/div[2]').text.replace('分', '')
            comment_sum = driver.find_element_by_xpath('//*[@id="comment_filter"]/li[1]/span').text.replace('(', '').replace(')', '')
            comment_photo_sum = driver.find_element_by_xpath('//*[@id="comment_filter"]/li[2]/span').text.replace('(', '').replace(')', '')
        except:
            pass

        # encode to GBK with 'ignore' to drop characters the console cannot show, then decode back
        print('\t----name----\t' + name.encode('gbk', 'ignore').decode('gbk'))
        print('\t----area----\t' + area.encode('gbk', 'ignore').decode('gbk'))

        # write the data to a local file
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write('--------------------------------------------------------------\n')
            f.write('\t----name----\t' + name.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----price----\t' + price.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----area----\t' + area.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----room tags----\t' + room_tag.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----pay tags----\t' + pay_tag.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----advantages----\t' + advan_tag.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----facilities----\t' + facility_service.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----house rules----\t' + house_rules.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----landlord type----\t' + landlord_type.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----landlord verification----\t' + landlord_authentication.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----other listings----\t' + landlord_other_house_num.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----overall score----\t' + overall_score.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----item scores----\t' + single_score.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----review count----\t' + comment_sum.encode('gbk', 'ignore').decode('gbk') + '\n')
            f.write('\t----photo reviews----\t' + comment_photo_sum.encode('gbk', 'ignore').decode('gbk') + '\n')

        # scrape the reviews on the current page
        get_data_comment(driver, file_path)

        # review pagination: read the total number of review pages
        comment_page_num = 1
        try:
            comment_page_num_str = driver.find_elements_by_xpath('//*[@id="comment_list"]/li[1]/div[2]/ul/li')[-1].get_attribute('page-data')
            comment_page_num = int(comment_page_num_str)
        except:
            pass
        # click through the review pages
        if comment_page_num > 1:
            click_num = 0
            while click_num < comment_page_num:
                # date of the last review on the current page
                try:
                    last_item = driver.find_element_by_xpath('//*[@id="comment_list"]/li[1]/div[1]/ul/li[last()]/div[2]/div[1]/div/span[2]').text
                    date = last_item.replace('-', '')[:6]
                    # only keep reviews from September 2017 onwards
                    if int(date) < 201709:
                        break
                except:
                    pass
                # scroll to the bottom
                js = "var q=document.documentElement.scrollTop=10000"
                driver.execute_script(js)
                time.sleep(2)
                try:
                    driver.find_elements_by_xpath('//*[@id="comment_list"]/li[1]/div[2]/ul/li')[-2].click()
                except:
                    break
                click_num += 1
                time.sleep(4)
                # scrape the reviews on the current page
                get_data_comment(driver, file_path)

        close_driver(driver)
    except:
        print('error')
        close_driver(driver)

# scrape the review block
def get_data_comment(driver, file_path):
    try:
        # number of reviews on the current page
        comment_curr_page = driver.find_elements_by_xpath('//*[@id="comment_list"]/li[1]/div[1]/ul/li')
        comment_curr_page_num = len(comment_curr_page)
        for i in range(comment_curr_page_num):
            xpath_head = '//*[@id="comment_list"]/li[1]/div[1]/ul/li[' + str(i + 1) + ']'
            # reviewer
            comment_person = driver.find_element_by_xpath(xpath_head + '/div[2]/div[1]/div/span[1]').text
            # review date (strip the trailing '点评', i.e. "reviewed on")
            comment_time = driver.find_element_by_xpath(xpath_head + '/div[2]/div[1]/div/span[2]').text.replace('点评', '')
            # review content
            comment_content = driver.find_element_by_xpath(xpath_head + '/div[2]/div[2]').text

            # landlord reply, if any
            comment_replay = ''
            try:
                comment_replay = driver.find_element_by_xpath(xpath_head + '/div[2]/div[4]/div[1]/div[2]/p').text.replace(':', '')
            except:
                pass

            # write the review data to a local file
            with open(file_path, 'a', encoding='utf-8') as f:
                f.write('\t---------------------review---------------------\n')
                f.write('\t\t---reviewer---\t' + comment_person.encode('gbk', 'ignore').decode('gbk') + '\n')
                f.write('\t\t---date---\t' + comment_time.encode('gbk', 'ignore').decode('gbk') + '\n')
                f.write('\t\t---content---\t' + comment_content.encode('gbk', 'ignore').decode('gbk') + '\n')
                f.write('\t\t---reply---\t' + comment_replay.encode('gbk', 'ignore').decode('gbk') + '\n')
    except:
        pass

def main(index):
    urls = read_url('./data/url/url_' + str(index) + '.txt')
    del_file('./data/house/data_' + str(index) + '.txt')
    for url in urls:
        driver = init_driver(url, index)
        get_data(driver, './data/house/data_' + str(index) + '.txt', index)

if __name__ == '__main__':
    # each run uses 10 threads
    threads = {}
    # craw_url.py produced 30 slices; my machine can only handle 10 threads,
    # so I run this script in three batches (1-11, 11-21, 21-31)
    for index in range(1, 11):
        threads['Thread_' + str(index)] = 0
        thread = threading.Thread(target=main, args=(index,))
        thread.start()
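
Because Python threads are non-daemon by default, the script only exits after every thread finishes. If you would rather crawl all 30 slices in one run while still capping concurrency at 10, a ThreadPoolExecutor variant would look roughly like this (my own sketch, not the original code; it assumes the main function and threads dict defined above):

# Sketch: crawl all 30 slices with at most 10 concurrent workers instead of
# launching three manual batches. concurrent.futures is in the stdlib.
from concurrent.futures import ThreadPoolExecutor

def crawl_all(slice_count=30, workers=10):
    # initialize the per-slice counters used by init_driver
    for i in range(1, slice_count + 1):
        threads['Thread_' + str(i)] = 0
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # list() drains the map, so crawl_all blocks until every slice is done
        list(pool.map(main, range(1, slice_count + 1)))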
————————————————
Copyright notice: this is an original article by the CSDN blogger "Yeoman92", distributed under the CC 4.0 BY-SA license; please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/yeoman92/article/details/83105318
