Python爬虫系列之网页多线程爬取房源数据

Python爬虫系列之网页多线程爬取房源数据

小程序爬虫接单、app爬虫接单、网页爬虫接单、接口定制、网站开发、小程序开发 > 点击这里联系我们 <

微信请扫描下方二维码

在这里插入图片描述

代码仅供学习交流，请勿用于非法用途

一、准备数据库

-- Database that holds the crawled listings.
create database house;

use house;

-- One row per crawled house. `HouseId` is declared unique, so inserting a
-- house that is already stored is rejected by the constraint.
-- Every scraped value is kept as text (varchar/text columns).
-- Fix: the comment on `CheckinTime` used to read '用户id' (user id), which did
-- not describe the column; it now reads '入住时间' (check-in time).
create table `house_jiazai`(
	`id` int primary key auto_increment,
	`HouseId` varchar(50) not null unique comment '房间id',
	`CheckinTime` varchar(50) default null comment '入住时间',
	`Title` text default null comment '标题',
	`Road` text default null comment 'Road',
	`Nong` text default null comment 'Nong',
	`Hao` text default null comment 'Hao',
	`XiaoQu` text default null comment '小区',
	`Shi` varchar(2) default null comment '室',
	`Ting` varchar(2) default null comment '厅',
	`Wei` varchar(2) default null comment '卫',
	`AtFloor` varchar(3) default null comment '所在楼层',
	`TotalFloor` varchar(3) default null comment '总楼层数',
	`Area` varchar(4) default null comment '面积',
	`IsElevator` varchar(8) default null comment '是否电梯',
	`ZuType` varchar(3) default null comment '租期',
	`CoverImage` text default null comment 'banner',
	`Price` varchar(8) default null comment '价格',
	`PriceUnit` varchar(8) default null comment '价格单位',
	`YjRatio` varchar(8) default null comment '押金比例',
	`UseTypeId` varchar(8) default null comment '用途',
	`Linkman` varchar(20) default null comment '联系人姓名',
	`Email` varchar(40) default null comment '联系人邮箱',
	`Phone` varchar(18) default null comment '联系人电话',
	`Content` text default null comment '房源介绍',
	`TrafficStation` text default null comment '交通介绍',
	`LivingType` varchar(8) default null comment '出租类型',
	`IsAdditionalYj` varchar(8) default null comment '是否需要押金',
	`GeYongAmount` varchar(8) default null comment 'GeYongAmount',
	`Lng` varchar(28) default null comment '经度',
	`Lat` varchar(28) default null comment '纬度',
	`features` text default null comment '特色与设施',
	`pictureList` text default null comment '轮播图列表',
	`userType` varchar(20) default null comment '用户类型'
)engine=INNODB CHARSET=utf8;

二、代码实现

import requests
import json
import threading
import MySQLdb
from queue import Queue
import logging
import re
from bs4 import BeautifulSoup

# Author     : 王磊 (Wang Lei)
# Date       : 2019/9/19
# Description: crawl listing data from a web site

# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
mysql_user = "root"        # database account
mysql_password = "root"    # database password
mysql_database = "house"   # database name
# Number of worker threads (original note: set to the machine's max thread count).
threadNum = 8
# ------------------------------------------------------------------
# End of configuration
# ------------------------------------------------------------------

# HTTP headers sent with every request issued through getHtml().
headers = {
    "PlatformId": "2bf1fffd-4ec6-46ba-a068-69d45c2b6017",
    "PlatformType": "mobile",
    "Referer": "http://m.jiazaishanghai.com/House/Search",
    "Token": "",
    "User-Agent": (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) "
        "AppleWebKit/601.1.46 (KHTML, like Gecko) "
        "Version/9.0 Mobile/13B143 Safari/601.1"
    ),
}

# Lookup tables: code (as a string) -> display name.
featureList = {
    "-1": "不限",
    "1001": "暖气",
    "1003": "浴缸",
    "1004": "户外空间",
    "1007": "健身",
    "1008": "厨房",
    "1009": "空调",
    "1010": "精装",
    "1011": "独卫",
    "1012": "阳台",
    "1013": "WIFI",
}
zuTypeList = {
    "-1": "不限",
    "0": "年租",
    "1": "短租",
}
houseUseTypeList = {
    "-1": "不限",
    "206": "住宅",
    "201": "工作室",
    "202": "住宅/工作室",
    "203": "商铺",
    "204": "写字楼",
    "205": "厂房/仓库",
}
livingTypeList = {
    "-1": "不限",
    "1001": "整租",
    "1002": "合租",
}

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s:%(name)s:%(levelname)s:%(message)s',
)


class jiazaiSpider(threading.Thread):
    """Worker thread that crawls listing pages and stores every house found.

    Each worker takes page numbers from the shared ``pageQueue`` until the
    queue is empty. For every page it fetches the house list, loads the detail
    record of each house, flattens it into a dict and inserts that dict into
    the ``house_jiazai`` table.
    """

    # Prefix that turns an image path returned by the API into a full URL.
    _IMAGE_URL_PREFIX = "https://api.jiazaishanghai.com/Image/Get?p="
    # Removes anything that looks like an HTML tag (fallback for ``Content``).
    _TAG_PATTERN = re.compile(r'<.*?>', re.S)
    # Fields read from ``data['house']``, in the order they are stored.
    _HOUSE_FIELDS = (
        'HouseId', 'CheckinTime', 'Title', 'Road', 'Nong', 'Hao', 'XiaoQu',
        'Shi', 'Ting', 'Wei', 'AtFloor', 'TotalFloor', 'Area', 'IsElevator',
        'ZuType', 'CoverImage', 'Price', 'PriceUnit', 'YjRatio', 'UseTypeId',
        'Linkman', 'Email', 'Phone', 'Content', 'TrafficStation', 'LivingType',
        'IsAdditionalYj', 'GeYongAmount', 'Lng', 'Lat',
    )
    # All inserted columns: the house fields plus three read from ``data`` itself.
    _COLUMNS = _HOUSE_FIELDS + ('features', 'pictureList', 'userType')
    # Parameterized statement: the driver quotes the values, so scraped text
    # containing quotes can neither break nor alter the statement.
    _INSERT_SQL = (
        "insert into house_jiazai(" + ", ".join(_COLUMNS) + ") "
        "values(" + ", ".join(["%s"] * len(_COLUMNS)) + ")"
    )

    def __init__(self, pageQueue, *args, **kwargs):
        """
        :param pageQueue: queue of page numbers shared by all workers
        """
        super(jiazaiSpider, self).__init__(*args, **kwargs)
        self.pageQueue = pageQueue

    def log(self, msg):
        '''
        Log a message at INFO level.
        :param msg: message (any object) to log
        :return: None
        '''
        logging.info(msg)

    def getFeatures(self, features):
        '''
        Translate feature codes into their display names.
        :param features: iterable of feature codes; falsy means "none"
        :return: list of names for the codes found in ``featureList``;
                 unknown codes are skipped
        '''
        if not features:
            return []
        codes = [str(feature) for feature in features]
        return [featureList[code] for code in codes if code in featureList]

    def getPictureList(self, pictureList):
        '''
        Build the full image URLs of the carousel pictures.
        :param pictureList: iterable of picture records carrying a ``Url``
                            entry; falsy means "none"
        :return: list of URLs; records without a usable ``Url`` are skipped
        '''
        urls = []
        for picture in pictureList or []:
            try:
                urls.append(self._IMAGE_URL_PREFIX + picture['Url'])
            except (KeyError, TypeError, IndexError):
                # Record is malformed or its Url is not a string: skip it.
                continue
        return urls

    def getZuType(self, zuType):
        '''
        Translate a lease-term code into its display name.
        :param zuType: lease-term code
        :return: name from ``zuTypeList``; "长租" when the code is falsy or unknown
        '''
        # NOTE(review): a falsy code (e.g. the integer 0) never reaches the
        # lookup, so the "0" -> "年租" entry only applies to the string "0".
        # Kept as in the original; confirm whether this is intended.
        if not zuType:
            return "长租"
        return zuTypeList.get(str(zuType), "长租")

    def getHouseUseType(self, useTypeId):
        '''
        Translate a usage code into its display name.
        :param useTypeId: usage code
        :return: name from ``houseUseTypeList``; None when falsy or unknown
        '''
        if not useTypeId:
            return None
        return houseUseTypeList.get(str(useTypeId))

    def getLivingType(self, livingType):
        '''
        Translate a rental-type code into its display name.
        :param livingType: rental-type code
        :return: name from ``livingTypeList``; None when falsy or unknown
        '''
        if not livingType:
            return None
        return livingTypeList.get(str(livingType))

    @staticmethod
    def _readField(container, key, convert=None):
        '''
        Read ``container[key]`` and optionally convert it, best effort.
        :param container: mapping to read from (may be None or malformed)
        :param key: key to look up
        :param convert: optional callable applied to the value
        :return: the (converted) value, or "" when lookup or conversion fails
        '''
        try:
            value = container[key]
            return value if convert is None else convert(value)
        except Exception:
            # Deliberate best effort: one bad field must not drop the record.
            return ""

    def _extractContent(self, rawHtml):
        '''
        Turn the HTML description of a house into plain text.
        :param rawHtml: HTML string
        :return: text of all ``<p>`` elements (those containing a ``<style>``
                 tag are skipped), one per line ending in CRLF; when that
                 yields nothing, the input with its tags stripped
        '''
        # Characters GBK cannot represent are dropped, as before
        # (presumably to avoid text the utf8 table cannot store - confirm).
        html = rawHtml.encode('gbk', 'ignore').decode('gbk')
        soup = BeautifulSoup(html, "html.parser")
        content = "".join(
            p.text + "\r\n" for p in soup.find_all("p") if not p.find_all("style")
        )
        if not content or content == "\r\n":
            content = self._TAG_PATTERN.sub('', html)
        return content

    def getHouseDetail(self, houseId):
        '''
        Fetch one house and flatten its detail record.
        :param houseId: id of the house
        :return: dict with one entry per table column (fields that are missing
                 or cannot be converted become ""); an empty dict when
                 ``houseId`` is falsy; None when the reply has no ``data``
        '''
        house = {}
        if not houseId:
            return house
        url = "https://api.jiazaishanghai.com/api/House/GetDetail?houseId=" + str(houseId) + "&id=0"
        try:
            houseDetail = getHtml(url)['data']
        except Exception as e:
            logging.warning("no detail data for house %s: %s", houseId, e)
            return None

        # A missing 'house' part makes every per-house field fall back to "".
        info = self._readField(houseDetail, 'house')
        converters = {
            'ZuType': self.getZuType,
            'CoverImage': lambda path: self._IMAGE_URL_PREFIX + path,
            'UseTypeId': self.getHouseUseType,
            'Content': self._extractContent,
            'LivingType': self.getLivingType,
        }
        for field in self._HOUSE_FIELDS:
            house[field] = self._readField(info, field, converters.get(field))

        # Lists are stored as JSON text.
        house['features'] = self._readField(
            houseDetail, 'features',
            lambda codes: json.dumps(self.getFeatures(codes), ensure_ascii=False))
        house['pictureList'] = self._readField(
            houseDetail, 'pictureList',
            lambda pictures: json.dumps(self.getPictureList(pictures), ensure_ascii=False))
        house['userType'] = self._readField(houseDetail, 'userType')
        return house

    def _insertParams(self, house):
        '''
        Build the parameter tuple for ``_INSERT_SQL``.
        :param house: dict holding every column in ``_COLUMNS``
        :return: tuple of values in column order; None stays None (stored as
                 NULL rather than the text 'None'), everything else is
                 converted with ``str`` so it is stored as before
        :raises KeyError: when a column is missing from ``house``
        '''
        return tuple(
            None if house[column] is None else str(house[column])
            for column in self._COLUMNS
        )

    def pipLine(self, house):
        '''
        Insert one house into ``house_jiazai``.
        Failures (for example a duplicate ``HouseId``) are logged, not raised.
        :param house: dict as produced by ``getHouseDetail``
        :return: None
        '''
        conn = None
        try:
            conn = MySQLdb.connect(user=mysql_user, password=mysql_password, database=mysql_database, charset='utf8')
            cursor = conn.cursor()
            try:
                cursor.execute(self._INSERT_SQL, self._insertParams(house))
                conn.commit()
            finally:
                cursor.close()
        except Exception as e:
            logging.error(e)
        finally:
            # Always release the connection; one is opened per record.
            if conn is not None:
                conn.close()

    def run(self):
        '''
        Process pages from the queue until it is empty.
        :return: None
        '''
        from queue import Empty  # local import: the file only imports Queue

        while True:
            # get_nowait() instead of empty() + get(): with several workers the
            # queue can be drained between the two calls, and a blocking get()
            # would then hang this thread forever.
            try:
                page = self.pageQueue.get_nowait()
            except Empty:
                break
            try:
                houseList = getHouseList(page)['list']
            except Exception:
                # One bad page must not kill the worker.
                logging.exception("failed to fetch listing page %s", page)
                continue
            for house in houseList or []:
                try:
                    houseId = house['HouseId']
                except (KeyError, TypeError, IndexError):
                    logging.warning("page %s: list entry without HouseId skipped", page)
                    continue
                houseDetail = self.getHouseDetail(houseId)
                if houseDetail:
                    self.log(houseDetail)
                    self.pipLine(houseDetail)


def getHtml(url, maxRetries=None, retryDelay=1.0):
    '''
    GET a URL and decode the response body as JSON, retrying on failure.

    The original retried forever in a silent loop with no pause, which hammers
    the server and hides the cause when a URL keeps failing. Failures are now
    logged and followed by a short pause; the default is still to retry
    without limit.
    :param url: address to request
    :param maxRetries: number of retries after the first attempt before the
                       last error is re-raised; None retries without limit
    :param retryDelay: seconds to wait between attempts
    :return: the decoded JSON document
    :raises Exception: the last error, only when ``maxRetries`` is exhausted
    '''
    import time  # local import: `time` is not among the file's top-level imports

    attempt = 0
    while True:
        attempt += 1
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            return json.loads(resp.content.decode("utf-8"))
        except Exception as e:
            # Broad on purpose: any request or decoding failure is retried.
            if maxRetries is not None and attempt > maxRetries:
                raise
            logging.warning("request failed (attempt %d), retrying %s: %s", attempt, url, e)
            time.sleep(retryDelay)


def getHouseList(page):
    '''
    Fetch one page of the house listing.
    :param page: page index, sent as ``PageIndex`` (page size is fixed at 10)
    :return: the value stored under ``data`` in the decoded response
    '''
    queryParts = (
        "CityCode=1001",
        "PageIndex=" + str(page),
        "PageSize=10",
        "IsElevator=-1",
        "keywords=",
        "orderBy=LastUpdateTime%20desc",
        "isCustomRange=false",
        "customRangeValue%5B%5D=0",
        "customRangeValue%5B%5D=100",
        "houseBrandId=",
    )
    listUrl = "https://api.jiazaishanghai.com/api/HouseNew/GetList?" + "&".join(queryParts)
    return getHtml(listUrl)['data']


def getTotalPage():
    '''
    Work out how many listing pages exist.
    :return: ``total`` reported by the first page, divided by the page size
             of 10 and rounded up
    '''
    fullPages, leftover = divmod(getHouseList(1)['total'], 10)
    if leftover:
        # A partially filled last page still counts as a page.
        return fullPages + 1
    return fullPages


def main():
    '''
    Queue every page number, then start ``threadNum`` worker threads.
    :return: None
    '''
    lastPage = getTotalPage()
    pageQueue = Queue(0)
    pageNo = 1
    while pageNo <= lastPage:
        pageQueue.put(pageNo)
        pageNo += 1
    for _ in range(threadNum):
        worker = jiazaiSpider(pageQueue)
        worker.start()


if __name__ == '__main__':
    main()

小程序爬虫接单、app爬虫接单、网页爬虫接单、接口定制、网站开发、小程序开发 > 点击这里联系我们 <

你可能感兴趣的:(Python)