代码仅供学习交流，请勿用于非法用途。
-- Schema for the scraped rental listings.
-- Both statements use "if not exists" so the script can be re-run safely.
create database if not exists house;
use house;

-- One row per listing; HouseId is the identifier returned by the remote API
-- and is unique, so inserting the same listing twice is rejected.
-- Every scraped attribute is stored as text exactly as the crawler writes it.
create table if not exists `house_jiazai`(
`id` int primary key auto_increment,
`HouseId` varchar(50) not null unique comment '房间id',
-- Filled from the API field CheckinTime; the previous column comment
-- ('用户id', i.e. "user id") did not describe this column.
`CheckinTime` varchar(50) default null comment '入住时间',
`Title` text default null comment '标题',
`Road` text default null comment 'Road',
`Nong` text default null comment 'Nong',
`Hao` text default null comment 'Hao',
`XiaoQu` text default null comment '小区',
`Shi` varchar(2) default null comment '室',
`Ting` varchar(2) default null comment '厅',
`Wei` varchar(2) default null comment '卫',
`AtFloor` varchar(3) default null comment '所在楼层',
`TotalFloor` varchar(3) default null comment '总楼层数',
`Area` varchar(4) default null comment '面积',
`IsElevator` varchar(8) default null comment '是否电梯',
-- Holds the label the crawler resolves from the API code, not the code itself.
`ZuType` varchar(3) default null comment '租期',
`CoverImage` text default null comment 'banner',
`Price` varchar(8) default null comment '价格',
`PriceUnit` varchar(8) default null comment '价格单位',
`YjRatio` varchar(8) default null comment '押金比例',
-- Holds the label the crawler resolves from the API code, not the code itself.
`UseTypeId` varchar(8) default null comment '用途',
`Linkman` varchar(20) default null comment '联系人姓名',
`Email` varchar(40) default null comment '联系人邮箱',
`Phone` varchar(18) default null comment '联系人电话',
`Content` text default null comment '房源介绍',
`TrafficStation` text default null comment '交通介绍',
-- Holds the label the crawler resolves from the API code, not the code itself.
`LivingType` varchar(8) default null comment '出租类型',
`IsAdditionalYj` varchar(8) default null comment '是否需要押金',
`GeYongAmount` varchar(8) default null comment 'GeYongAmount',
`Lng` varchar(28) default null comment '经度',
`Lat` varchar(28) default null comment '纬度',
-- features and pictureList hold a list serialised to text by the crawler.
`features` text default null comment '特色与设施',
`pictureList` text default null comment '轮播图列表',
`userType` varchar(20) default null comment '用户类型'
)engine=INNODB CHARSET=utf8;
import requests
import json
import threading
import MySQLdb
from queue import Queue
import logging
import re
from bs4 import BeautifulSoup
'''
@Author :王磊
@Date :2019/9/19
@Description:某网页数据爬取
'''
# ---------------------------------------------------------------------------
# Configuration: edit the values in this section before running the crawler.
# ---------------------------------------------------------------------------
# Database account
mysql_user = "root"
# Database password
mysql_password = "root"
# Database name
mysql_database = "house"
# Number of worker threads to start
threadNum = 8
# ---------------------------------------------------------------------------
# End of configuration.
# ---------------------------------------------------------------------------

# HTTP headers sent with every API request.
headers = {
    "PlatformId": "2bf1fffd-4ec6-46ba-a068-69d45c2b6017",
    "PlatformType": "mobile",
    "Referer": "http://m.jiazaishanghai.com/House/Search",
    "Token": "",
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
}

# Lookup tables translating the codes used by the API into display labels.
# Keys are the codes as strings; callers convert a code with str() first.
featureList = {
    "-1": "不限",
    "1001": "暖气",
    "1003": "浴缸",
    "1004": "户外空间",
    "1007": "健身",
    "1008": "厨房",
    "1009": "空调",
    "1010": "精装",
    "1011": "独卫",
    "1012": "阳台",
    "1013": "WIFI",
}
zuTypeList = {
    "-1": "不限",
    "0": "年租",
    "1": "短租",
}
houseUseTypeList = {
    "-1": "不限",
    "206": "住宅",
    "201": "工作室",
    "202": "住宅/工作室",
    "203": "商铺",
    "204": "写字楼",
    "205": "厂房/仓库",
}
livingTypeList = {
    "-1": "不限",
    "1001": "整租",
    "1002": "合租",
}

# Root logger: INFO and above, timestamped.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s:%(name)s:%(levelname)s:%(message)s',
)
class jiazaiSpider(threading.Thread):
    '''
    Worker thread that crawls listing pages and stores every house found.

    Each worker takes page numbers from a shared queue, loads the listing of
    that page, loads the detail record of every house on it and inserts the
    flattened record into the house_jiazai table.
    '''

    # Prefix that turns an image path returned by the API into a full URL.
    _IMAGE_URL_PREFIX = "https://api.jiazaishanghai.com/Image/Get?p="

    # Matches any HTML tag; used as a fallback to strip markup from text.
    _TAG_PATTERN = re.compile(r'<.*?>', re.S)

    # Keys read from the "house" object of the detail response, in the order
    # of the table columns. The second item names the method that converts
    # the raw value, or is None when the value is stored as received.
    _HOUSE_FIELDS = (
        ("HouseId", None),
        ("CheckinTime", None),
        ("Title", None),
        ("Road", None),
        ("Nong", None),
        ("Hao", None),
        ("XiaoQu", None),
        ("Shi", None),
        ("Ting", None),
        ("Wei", None),
        ("AtFloor", None),
        ("TotalFloor", None),
        ("Area", None),
        ("IsElevator", None),
        ("ZuType", "getZuType"),
        ("CoverImage", "_imageUrl"),
        ("Price", None),
        ("PriceUnit", None),
        ("YjRatio", None),
        ("UseTypeId", "getHouseUseType"),
        ("Linkman", None),
        ("Email", None),
        ("Phone", None),
        ("Content", "_cleanContent"),
        ("TrafficStation", None),
        ("LivingType", "getLivingType"),
        ("IsAdditionalYj", None),
        ("GeYongAmount", None),
        ("Lng", None),
        ("Lat", None),
    )

    # Columns written by pipLine, in insert order.
    _COLUMNS = tuple(name for name, _ in _HOUSE_FIELDS) + ("features", "pictureList", "userType")

    # Insert statement with one driver placeholder per column; the values are
    # bound by the driver and never formatted into the SQL text.
    _INSERT_SQL = ("insert into house_jiazai(" + ", ".join(_COLUMNS) + ") "
                   "values(" + ", ".join(["%s"] * len(_COLUMNS)) + ")")

    def __init__(self, pageQueue, *args, **kwargs):
        '''
        :param pageQueue: queue of page numbers shared by all workers
        :param args: forwarded to threading.Thread
        :param kwargs: forwarded to threading.Thread
        '''
        super(jiazaiSpider, self).__init__(*args, **kwargs)
        self.pageQueue = pageQueue

    def log(self, msg):
        '''
        Write a message to the log at INFO level.
        :param msg: message (any object) to log
        :return: None
        '''
        logging.info(msg)

    def getFeatures(self, features):
        '''
        Translate facility codes into their labels.
        :param features: iterable of facility codes, or a falsy value
        :return: list of labels; codes missing from featureList are skipped
        '''
        if not features:
            return []
        return [featureList[str(code)] for code in features if str(code) in featureList]

    def getPictureList(self, pictureList):
        '''
        Build the full URL of every carousel picture.
        :param pictureList: iterable of objects with a 'Url' entry, or a falsy value
        :return: list of URLs; entries without a usable 'Url' are skipped
        '''
        urls = []
        for picture in pictureList or []:
            try:
                urls.append(self._IMAGE_URL_PREFIX + picture['Url'])
            except (KeyError, TypeError):
                # Entry has no 'Url', or it is not a string: skip it.
                continue
        return urls

    def getZuType(self, zuType):
        '''
        Translate a lease-term code into its label.
        :param zuType: lease-term code
        :return: label from zuTypeList, or "长租" when the code is missing or unknown
        '''
        default = "长租"
        # Only None and "" count as "no code". A plain truthiness test would
        # also discard 0, which has its own entry ("0") in zuTypeList.
        if zuType is None or zuType == "":
            return default
        return zuTypeList.get(str(zuType), default)

    def getHouseUseType(self, useTypeId):
        '''
        Translate a usage code into its label.
        :param useTypeId: usage code
        :return: label from houseUseTypeList, or None when missing or unknown
        '''
        if not useTypeId:
            return None
        return houseUseTypeList.get(str(useTypeId))

    def getLivingType(self, livingType):
        '''
        Translate a rental-mode code into its label.
        :param livingType: rental-mode code
        :return: label from livingTypeList, or None when missing or unknown
        '''
        if not livingType:
            return None
        return livingTypeList.get(str(livingType))

    @staticmethod
    def _pick(source, key, convert=None):
        '''
        Read source[key] and optionally convert it, falling back to "".
        :param source: mapping to read from (may be None or any other object)
        :param key: key to look up
        :param convert: optional callable applied to the value
        :return: the (converted) value, or "" when lookup or conversion fails
        '''
        try:
            value = source[key]
            return convert(value) if convert is not None else value
        except Exception:
            # Deliberate best effort: one unusable field must not discard
            # the rest of the record.
            return ""

    def _imageUrl(self, path):
        '''
        Prefix an image path with the image endpoint.
        :param path: image path as returned by the API
        :return: full image URL
        '''
        return self._IMAGE_URL_PREFIX + path

    def _cleanContent(self, rawHtml):
        '''
        Reduce the HTML description of a house to plain text.
        :param rawHtml: description markup
        :return: text of all <p> elements, one per line; when that yields
                 nothing, the markup with every tag removed
        '''
        # Drop every character that has no GBK representation.
        markup = rawHtml.encode('gbk', 'ignore').decode('gbk')
        soup = BeautifulSoup(markup, "html.parser")
        content = ""
        for paragraph in soup.find_all("p"):
            # Paragraphs that wrap a <style> element carry no description text.
            if paragraph.find_all("style"):
                continue
            content += paragraph.text + "\r\n"
        if content in ("", "\r\n"):
            content = self._TAG_PATTERN.sub('', markup)
        return content

    def getHouseDetail(self, houseId):
        '''
        Load the detail record of one house and flatten it for storage.
        :param houseId: identifier of the house
        :return: dict with one entry per table column (unavailable values are
                 ""), an empty dict when houseId is falsy, or None when the
                 detail response cannot be read
        '''
        if not houseId:
            return {}
        # str() keeps numeric identifiers from raising on concatenation.
        url = "https://api.jiazaishanghai.com/api/House/GetDetail?houseId=" + str(houseId) + "&id=0"
        try:
            detail = getHtml(url)['data']
        except Exception as e:
            logging.warning("could not read detail of house %s: %s", houseId, e)
            return None

        record = self._pick(detail, 'house')
        house = {}
        for key, converterName in self._HOUSE_FIELDS:
            converter = getattr(self, converterName) if converterName else None
            house[key] = self._pick(record, key, converter)
        # json.dumps keeps the text valid even when a value contains a quote.
        house['features'] = self._pick(
            detail, 'features',
            lambda codes: json.dumps(self.getFeatures(codes), ensure_ascii=False))
        house['pictureList'] = self._pick(
            detail, 'pictureList',
            lambda pictures: json.dumps(self.getPictureList(pictures), ensure_ascii=False))
        house['userType'] = self._pick(detail, 'userType')
        return house

    @staticmethod
    def _sqlValue(value):
        '''
        Convert a scraped value into the form bound to the insert statement.
        :param value: raw value
        :return: None (stored as NULL) for None, otherwise str(value)
        '''
        return None if value is None else str(value)

    def pipLine(self, house):
        '''
        Insert one flattened house record into house_jiazai.

        Failures (connection errors, duplicate HouseId, missing keys) are
        logged and not raised, so a bad record never stops the worker.
        :param house: dict as produced by getHouseDetail
        :return: None
        '''
        conn = None
        try:
            conn = MySQLdb.connect(user=mysql_user, password=mysql_password, database=mysql_database, charset='utf8')
            cursor = conn.cursor()
            params = tuple(self._sqlValue(house[column]) for column in self._COLUMNS)
            # Values are passed separately so the driver escapes them.
            cursor.execute(self._INSERT_SQL, params)
            conn.commit()
        except Exception as e:
            logging.error(e)
        finally:
            if conn is not None:
                try:
                    conn.close()
                except Exception as e:
                    logging.error(e)

    def run(self):
        '''
        Process pages from the queue until it is empty.
        :return: None
        '''
        from queue import Empty  # local import: the file only imports Queue

        while True:
            try:
                # Non-blocking: a blocking get() after an empty() check hangs
                # forever when another worker takes the last page in between.
                page = self.pageQueue.get_nowait()
            except Empty:
                break
            try:
                for house in getHouseList(page)['list']:
                    houseDetail = self.getHouseDetail(house['HouseId'])
                    if houseDetail:
                        self.log(houseDetail)
                        self.pipLine(houseDetail)
            except Exception as e:
                # Give up on this page but keep the worker alive.
                logging.error("could not process page %s: %s", page, e)
def getHtml(url, max_retries=None, retry_delay=1.0):
    '''
    Request a URL and return the response body parsed as JSON.

    A failed attempt (network error, timeout, undecodable or non-JSON body)
    is logged and retried after a pause instead of being retried at once.
    :param url: address to request
    :param max_retries: number of retries allowed after the first attempt;
                        None (default) retries without limit
    :param retry_delay: base pause in seconds; the pause grows with each
                        failed attempt and is capped at 30 seconds
    :return: the parsed JSON document
    :raises Exception: the last error, once max_retries is exhausted
    '''
    import time  # local import: the file header does not import time

    failures = 0
    while True:
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            return json.loads(resp.content.decode("utf-8"))
        except Exception as e:
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            logging.warning("request failed (attempt %d), retrying %s: %s", failures, url, e)
            if retry_delay > 0:
                time.sleep(min(retry_delay * failures, 30))
def getHouseList(page):
    '''
    Load one page of the house listing.
    :param page: page number, sent as the PageIndex query parameter
    :return: the 'data' entry of the parsed listing response
    '''
    endpoint = "https://api.jiazaishanghai.com/api/HouseNew/GetList"
    # Fixed query: city 1001, ten results per page, newest update first.
    beforePage = "?CityCode=1001&PageIndex="
    afterPage = "&PageSize=10&IsElevator=-1&keywords=&orderBy=LastUpdateTime%20desc&isCustomRange=false&customRangeValue%5B%5D=0&customRangeValue%5B%5D=100&houseBrandId="
    response = getHtml("".join([endpoint, beforePage, str(page), afterPage]))
    return response['data']
def getTotalPage():
    '''
    Work out how many listing pages exist.
    :return: the 'total' reported for page 1 divided by ten, rounded up
    '''
    fullPages, remainder = divmod(getHouseList(1)['total'], 10)
    if remainder == 0:
        return fullPages
    # A partly filled last page still has to be fetched.
    return fullPages + 1
def main():
    '''
    Queue every listing page and start the worker threads.
    :return: None
    '''
    lastPage = getTotalPage()
    pageQueue = Queue(0)
    page = 1
    while page <= lastPage:
        pageQueue.put(page)
        page += 1
    # All workers share the one queue; each is started right after creation.
    for _ in range(threadNum):
        jiazaiSpider(pageQueue).start()


if __name__ == '__main__':
    main()