Lianjia scraper (asyncio)

#!/usr/bin/env python
# encoding: utf-8
"""
Author: ISeeMoon
Python: 3.6
Software: PyCharm
File: Lj_async.py
Time: 2018/5/6 15:26
"""

import requests
from lxml import etree
import asyncio
import aiohttp
import pandas
import re
import math
import time


location_info = '''    1→Hangzhou
    2→Wuhan
    3→Beijing
    Press ENTER to confirm: '''
location_select = input(location_info)
# menu choice → Lianjia city subdomain
location_dic = {'1': 'hz',
                '2': 'wh',
                '3': 'bj'}
city_url = 'https://{}.lianjia.com/ershoufang/'.format(location_dic[location_select])
down = input('Lower price bound (万): ')
up = input('Upper price bound (万): ')

inter_list = [(int(down), int(up))]

def half_inter(inter):
    """Split a price band in two; assumes bands are wider than 1万, or the split is a no-op."""
    lower = inter[0]
    upper = inter[1]
    delta = int((upper - lower) / 2)
    inter_list.remove(inter)
    print('Split price band', inter)
    inter_list.append((lower, lower + delta))
    inter_list.append((lower + delta, upper))

pagenum = {}
def get_num(inter):
    """Return the total listing count Lianjia reports for a price band, e.g. bp300ep400/."""
    url = city_url + 'bp{}ep{}/'.format(inter[0], inter[1])
    r = requests.get(url).text
    num = int(etree.HTML(r).xpath("//h2[@class='total fl']/span/text()")[0].strip())
    pagenum[(inter[0], inter[1])] = num
    return num

totalnum = get_num(inter_list[0])   # listings in the whole price range, used in the final summary

# Lianjia serves at most 100 pages × 30 listings per query, so keep splitting
# until every price band reports no more than 3000 listings. Counts are taken
# once per pass, and we iterate over the snapshot rather than the list that
# half_inter() mutates.
while True:
    counts = {x: get_num(x) for x in inter_list}
    if all(n <= 3000 for n in counts.values()):
        break
    for band, n in counts.items():
        if n > 3000:
            half_inter(band)
print('Finished narrowing the price bands!')
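# A minimal, self-contained sketch of the same bisection idea (not the live
# scraper): count() is a fake stand-in for get_num(), so this runs with no
# network access. Uncomment to try it.
#
# def split_until_fits(lo, hi, count, cap=3000):
#     """Recursively split (lo, hi) until every band holds <= cap listings."""
#     if count(lo, hi) <= cap or hi - lo <= 1:
#         return [(lo, hi)]
#     mid = lo + (hi - lo) // 2
#     return split_until_fits(lo, mid, count, cap) + split_until_fits(mid, hi, count, cap)
#
# fake_count = lambda a, b: 5000 if b - a > 100 else 2000
# print(split_until_fits(100, 500, fake_count))
# # → [(100, 200), (200, 300), (300, 400), (400, 500)]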

url_lst = []
url_lst_failed = []
url_lst_successed = []
url_lst_duplicated = []

# Build one url per result page, e.g. https://wh.lianjia.com/ershoufang/pg2bp300ep400/
for i in inter_list:
    totalpage = math.ceil(pagenum[i] / 30)   # 30 listings per page
    for j in range(1, totalpage + 1):
        url = city_url + 'pg{}bp{}ep{}/'.format(j, i[0], i[1])
        url_lst.append(url)
print('URL list built!')
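# Page-math check (hypothetical count): a band reporting 2897 listings needs
# math.ceil(2897 / 30) == 97 result pages.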

info_lst = []

# houseInfo comes back as one '|'-separated string:
# 小区 | 户型 | 面积 | 朝向 | 装修 | 电梯
HOUSE_KEYS = ['xiaoqu', 'huxing', 'area', 'chaoxiang', 'zhuangxiu', 'dianti']

def first_or_slash(items):
    """Return the first item of an xpath/findall result, or '/' when there is none."""
    return items[0] if items else '/'

def parse_node(node):
    """Extract one listing from a result-page <li> node; missing fields become '/'."""
    info_dic = {}
    info_dic['title'] = first_or_slash(node.xpath(".//div[@class='title']/a/text()"))
    info_dic['href'] = first_or_slash(node.xpath(".//div[@class='title']/a/@href"))
    house = node.xpath(".//div[@class='houseInfo']")
    parts = house[0].xpath('string(.)').replace(' ', '').split('|') if house else []
    for idx, key in enumerate(HOUSE_KEYS):
        info_dic[key] = parts[idx] if idx < len(parts) else '/'
    position = first_or_slash(node.xpath(".//div[@class='positionInfo']/text()"))
    info_dic['louceng'] = first_or_slash(re.findall(r'\((.*)\)', position))    # floor
    info_dic['nianxian'] = first_or_slash(re.findall(r'\)(.*?)年', position))  # build year
    follow_text = first_or_slash(node.xpath(".//div[@class='followInfo']/text()"))
    follow = follow_text.replace(' ', '').split('/') if follow_text != '/' else []
    info_dic['guanzhu'] = ''.join(re.findall('[0-9]', follow[0])) if len(follow) > 0 else '/'  # followers
    info_dic['daikan'] = ''.join(re.findall('[0-9]', follow[1])) if len(follow) > 1 else '/'   # viewings
    info_dic['fabu'] = follow[2] if len(follow) > 2 else '/'                                   # listed date
    info_dic['totalprice'] = first_or_slash(node.xpath(".//div[@class='totalPrice']/span/text()"))
    info_dic['unitprice'] = first_or_slash(node.xpath(".//div[@class='unitPrice']/span/text()")).replace('单价', '')
    return info_dic

async def get_info(url):
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, timeout=5) as resp:
                if resp.status != 200:
                    url_lst_failed.append(url)
                    return
                url_lst_successed.append(url)
                r = await resp.text()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            # timed-out or dropped requests are recorded here and retried later
            url_lst_failed.append(url)
            return
    print('Fetching {}'.format(url))
    nodelist = etree.HTML(r).xpath("//ul[@class='sellListContent']/li")
    for index, node in enumerate(nodelist, start=1):
        info_dic = parse_node(node)
        # skip listings whose detail link was already collected
        if any(info_dic['href'] in dic.values() for dic in info_lst):
            url_lst_duplicated.append(info_dic)
        else:
            info_lst.append(info_dic)
        print('No.{}:    {} → listing captured!'.format(index, info_dic['title']))
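# For illustration only, one parsed listing comes out shaped like this
# (all values hypothetical):
# {'title': '精装两房 采光好', 'href': 'https://wh.lianjia.com/ershoufang/....html',
#  'xiaoqu': '某某小区', 'huxing': '2室1厅', 'area': '89平米', 'chaoxiang': '南',
#  'zhuangxiu': '精装', 'dianti': '有电梯', 'louceng': '中楼层', 'nianxian': '2010',
#  'guanzhu': '52', 'daikan': '3', 'fabu': '15天以前发布',
#  'totalprice': '180', 'unitprice': '20112元/平米'}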

start = time.time()

# First pass: request every page concurrently. Requests that time out or error
# land in url_lst_failed and are retried below.
tasks = [asyncio.ensure_future(get_info(url)) for url in url_lst]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
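# A hedged refinement (sketch, not the author's code): firing every request at
# once is what pushes many of them into the 5s timeout. Capping concurrency
# with an asyncio.Semaphore makes the first pass far more reliable; 50 is an
# assumed polite limit, tune to taste.
#
# sem = asyncio.Semaphore(50)
#
# async def get_info_throttled(url):
#     async with sem:          # at most 50 requests in flight at a time
#         await get_info(url)
#
# tasks = [asyncio.ensure_future(get_info_throttled(url)) for url in url_lst]
# loop.run_until_complete(asyncio.wait(tasks))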

# Re-request any url that has not yet succeeded, until none remain.
url_lst_unrequested = [url for url in url_lst if url not in url_lst_successed]
while len(url_lst_unrequested) > 0:
    tasks_unrequested = [asyncio.ensure_future(get_info(url)) for url in url_lst_unrequested]
    loop.run_until_complete(asyncio.wait(tasks_unrequested))
    url_lst_unrequested = [url for url in url_lst if url not in url_lst_successed]
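# Optional guard (sketch, an assumption rather than the author's design): bound
# the retries so one permanently broken url cannot spin the loop above forever.
#
# for attempt in range(5):
#     pending = [u for u in url_lst if u not in url_lst_successed]
#     if not pending:
#         break
#     loop.run_until_complete(asyncio.wait(
#         [asyncio.ensure_future(get_info(u)) for u in pending]))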
end = time.time()
print('The price range holds {} listings ({} duplicates skipped); {} unique listings scraped.'.format(
    totalnum, len(url_lst_duplicated), len(info_lst)))
print('Elapsed: {} seconds'.format(end - start))

df = pandas.DataFrame(info_lst)
df.to_csv(r"C:\test\ljwh.csv",encoding='gbk')
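# Note: encoding='gbk' raises UnicodeEncodeError on any character outside the
# GBK codepage. A safer sketch (same output path, assumed writable):
#
# df.to_csv(r"C:\test\ljwh.csv", index=False, encoding='utf-8-sig')  # BOM keeps Excel happy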

################## Synchronous version (for timing comparison) ##################
# info_lst = []
#
# start1 = time.time()
# for url in url_lst:
#     resp = requests.get(url)
#     nodelist = etree.HTML(resp.text).xpath("//ul[@class='sellListContent']/li")
#     print('Fetching {}'.format(resp.url))
#     for index, node in enumerate(nodelist, start=1):
#         info_dic = parse_node(node)   # same field extraction as the async version above
#         if any(info_dic['href'] in dic.values() for dic in info_lst):
#             url_lst_duplicated.append(info_dic)
#         else:
#             info_lst.append(info_dic)
#         print('No.{}:    {} → listing captured!'.format(index, info_dic['title']))
# end1 = time.time()
# print('Scraped {} listings in total.'.format(len(info_lst)))
# print('Elapsed: {} seconds'.format(end1 - start1))
