1. Create the lianjiaershoufang database in MySQL.
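A minimal SQL sketch of this step (the utf8mb4 character set is an assumption, not something the original specifies):

CREATE DATABASE IF NOT EXISTS lianjiaershoufang DEFAULT CHARACTER SET utf8mb4;
USE lianjiaershoufang;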
2. Create the chengjiao table with the following columns:
+---------------+--------------+------+-----+---------+-------+
| Field         | Type         | Null | Key | Default | Extra |
+---------------+--------------+------+-----+---------+-------+
| id            | char(60)     | NO   | PRI | NULL    |       |
| qu            | varchar(20)  | YES  |     | NULL    |       |
| zhen          | varchar(30)  | YES  |     | NULL    |       |
| xiaoquName    | varchar(100) | YES  |     | NULL    |       |
| xiaoquYear    | varchar(60)  | YES  |     | NULL    |       |
| title         | varchar(200) | YES  |     | NULL    |       |
| houseInfo     | varchar(200) | YES  |     | NULL    |       |
| dealDate      | varchar(60)  | YES  |     | NULL    |       |
| totalPrice    | varchar(20)  | YES  |     | NULL    |       |
| positionInfo  | varchar(60)  | YES  |     | NULL    |       |
| unitPrice     | varchar(20)  | YES  |     | NULL    |       |
| dealHouseInfo | varchar(60)  | YES  |     | NULL    |       |
| postPrice     | varchar(20)  | YES  |     | NULL    |       |
| dealCycle     | varchar(10)  | YES  |     | NULL    |       |
+---------------+--------------+------+-----+---------+-------+
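The listing above can be reproduced with a CREATE TABLE statement along these lines (a sketch reconstructed from the column listing; the default character set is an assumption):

CREATE TABLE chengjiao (
    id            CHAR(60)     NOT NULL PRIMARY KEY,
    qu            VARCHAR(20),
    zhen          VARCHAR(30),
    xiaoquName    VARCHAR(100),
    xiaoquYear    VARCHAR(60),
    title         VARCHAR(200),
    houseInfo     VARCHAR(200),
    dealDate      VARCHAR(60),
    totalPrice    VARCHAR(20),
    positionInfo  VARCHAR(60),
    unitPrice     VARCHAR(20),
    dealHouseInfo VARCHAR(60),
    postPrice     VARCHAR(20),
    dealCycle     VARCHAR(10)
) DEFAULT CHARSET = utf8mb4;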
3. Scrape the data and insert it into the database one record at a time.
import asyncio
import aiohttp
from lxml import etree
import logging
import datetime
import openpyxl
import nest_asyncio
nest_asyncio.apply()
from bs4 import BeautifulSoup
import re
import pymysql
import time
import random
class Spider(object):
    def __init__(self):
        self.semaphore = asyncio.Semaphore(1)  # Semaphore limiting concurrent coroutines, so we don't crawl too fast and trigger anti-scraping
        self.header = {
            "Cookie": "fill in your own browser cookie here",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }
        self.flag = 0
    # Request the given url and return the HTML as a string
    async def request_data(self, url):
        async with self.semaphore:
            try:
                async with aiohttp.ClientSession(headers=self.header) as session:
                    async with session.get(url) as response:
                        result = await response.text()
            except Exception as e:
                print("request to %s failed" % url)
                result = None
            return result
    # Given a town (zhen) name, fetch all residential compounds (xiaoqu) in it
    async def get_all_xiaoqu_from_zhen(self, qu, zhen):
        url = 'https://sh.lianjia.com/xiaoqu/%s/' % zhen
        html_content = await self.request_data(url)
        if html_content is None:
            return
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(html_content, 'lxml')
        # The <h2 class="total fl"> element holds the total number of compounds
        total_tag = soup.find('h2', class_='total fl')
        if total_tag is None:
            return
        span_tag = total_tag.find('span')
        # Number of compounds in this town; 30 compounds are listed per page
        xiaoqu_number = int(span_tag.text.strip())
        page_number = 0 if (xiaoqu_number % 30 == 0) else 1
        page_number = page_number + xiaoqu_number // 30
        print(">>> district: %s, town: %s, compounds: %d, totalPage: %d" % (qu, zhen, xiaoqu_number, page_number))
        for pg in range(1, page_number + 1):
            print(">>>> visiting district: %s, town: %s, page %d/%d" % (qu, zhen, pg, page_number))
            await self.get_one_page_xiaoqu(qu, zhen, pg)
    # Given district (qu), town (zhen) and page number, fetch all compounds on that page
    async def get_one_page_xiaoqu(self, qu, zhen, pg):
        url = 'https://sh.lianjia.com/xiaoqu/%s/' % zhen
        if pg > 1:
            url += "pg%s/" % pg
        try:
            print(">>>> visiting compound list page: %s" % url)
            html_text = await self.request_data(url)
        except Exception as e:
            print(">>>> request data failed!")
            return
        if html_text is None:
            return
        soup = BeautifulSoup(html_text, 'lxml')
        info_divs = soup.find_all('li', class_='clear xiaoquListItem')
        for xiaoqu in info_divs:
            xiaoqu_id = xiaoqu['data-id']
            xiaoqu_name = xiaoqu.find('div', class_='title').get_text(strip=True)
            # The build year is the last '/'-separated field of the positionInfo text
            xiaoqu_year = xiaoqu.find('div', class_='positionInfo').get_text(strip=True).split('/')[-1].strip()
            if await self.get_one_xiaoqu(qu, zhen, xiaoqu_id, xiaoqu_name, xiaoqu_year) == False:
                return False
    # Walk every district; the outer while keeps re-scanning all districts
    async def get_all_qu(self):
        Qu = ['pudong', 'minhang', 'baoshan', 'xuhui', 'putuo', 'yangpu', 'changning', 'songjiang', 'jiading', 'huangpu', 'jingan', 'hongkou', 'qingpu', 'fengxian', 'jinshan', 'chongming']
        while True:
            for qu in Qu:
                print("> start fetching district %s" % qu)
                await self.get_all_zhen_from_qu(qu)
                print("> finished fetching district %s" % qu)
    # Fetch all deal (chengjiao) pages of one compound
    async def get_one_xiaoqu(self, qu, zhen, xiaoqu_id, xiaoqu_name, xiaoqu_year):
        url = 'https://sh.lianjia.com/chengjiao/c%s/' % xiaoqu_id
        html_text = await self.request_data(url)
        if html_text is None:
            return
        soup = BeautifulSoup(html_text, 'lxml')
        total_tag = soup.find('div', class_='total fl')
        if total_tag is None:
            return
        span_tag = total_tag.find('span')
        # Number of closed deals in this compound; 30 deals are listed per page
        fangyuan_number = int(span_tag.text.strip())
        page_number = 0 if (fangyuan_number % 30 == 0) else 1
        page_number = page_number + fangyuan_number // 30
        print(">>>>> compound: %s, deals: %d, pages: %d" % (xiaoqu_name, fangyuan_number, page_number))
        for pg in range(1, page_number + 1):
            print(">>>>>> compound: %s, page %d/%d" % (xiaoqu_name, pg, page_number))
            if await self.get_xiaoqu_one_page_fangyuan(qu, zhen, xiaoqu_id, xiaoqu_name, xiaoqu_year, pg) == False:
                return False
    # Fetch one deals page of a compound and insert every record into MySQL
    async def get_xiaoqu_one_page_fangyuan(self, qu, zhen, xiaoqu_id, xiaoqu_name, xiaoqu_year, pg):
        url = 'https://sh.lianjia.com/chengjiao/c%s/' % xiaoqu_id
        if pg > 1:
            url += "pg%s/" % pg
        print(">>>>>> district: %s, compound: %s, url: %s" % (qu, xiaoqu_name, url))
        html_text = await self.request_data(url)
        if html_text is None:
            return
        soup = BeautifulSoup(html_text, 'lxml')
        info_divs = soup.find_all('div', class_='info')
        result_list = []
        conn = pymysql.connect(host='localhost', user='root', password='123456', db='lianjiaershoufang')
        cursor = conn.cursor()
        index = 0
        # Small random delay between pages to look less like a bot
        delay = random.uniform(0.01, 0.2)
        await asyncio.sleep(delay)
        for info_div in info_divs:
            try:
                # Collect one deal record into a dict
                info_dict = {}
                info_dict['qu'] = qu
                info_dict['zhen'] = zhen
                info_dict['xiaoquName'] = xiaoqu_name
                info_dict['xiaoquYear'] = xiaoqu_year
                # Listing title
                title_div = info_div.find('div', class_='title')
                info_dict['title'] = title_div.text.strip() if title_div else None
                # Address block: house info, deal date, total price
                address_div = info_div.find('div', class_='address')
                houseInfo = address_div.find('div', class_='houseInfo')
                info_dict['houseInfo'] = houseInfo.text.strip() if houseInfo else None
                dealDate = address_div.find('div', class_='dealDate')
                info_dict['dealDate'] = dealDate.text.strip() if dealDate else None
                totalPrice = address_div.find('div', class_='totalPrice')
                number = totalPrice.find('span', class_='number')
                info_dict['totalPrice'] = number.text.strip() if number else None
                # Flood block: position info and unit price
                flood_div = info_div.find('div', class_='flood')
                positionInfo = flood_div.find('div', class_='positionInfo')
                info_dict['positionInfo'] = positionInfo.text.strip() if positionInfo else None
                unitPrice = flood_div.find('div', class_='unitPrice')
                number = unitPrice.find('span', class_='number')
                info_dict['unitPrice'] = number.text.strip() if number else None
                # Extra deal info badges
                deal_house_info_div = info_div.find('div', class_='dealHouseInfo')
                info_dict['dealHouseInfo'] = deal_house_info_div.text.strip() if deal_house_info_div else None
                # Deal-cycle text containing the asking price and the deal cycle (挂牌…万 / 成交周期…天)
                deal_cycle_info_div = info_div.find('div', class_='dealCycleeInfo')
                deal_cycle_str = deal_cycle_info_div.text.strip() if deal_cycle_info_div else ''
                # Asking (listing) price
                info_dict['postPrice'] = None
                listing_price = re.search(r'挂牌(\d+)万', deal_cycle_str)
                if listing_price:
                    info_dict['postPrice'] = listing_price.group(1)
                # Deal cycle in days
                info_dict['dealCycle'] = None
                transaction_period = re.search(r'成交周期(\d+)天', deal_cycle_str)
                if transaction_period:
                    info_dict['dealCycle'] = transaction_period.group(1)
                # Build a unique primary key for the record
                info_dict['id'] = xiaoqu_id + "-" + info_dict['dealDate'] + "-" + info_dict['unitPrice'] + "-" + info_dict['totalPrice']
                result_list.append(info_dict)
                # Parameterized insert, so quotes in titles etc. cannot break the statement
                sql = ("INSERT INTO chengjiao (id, qu, zhen, xiaoquName, xiaoquYear, title, houseInfo, dealDate, totalPrice, "
                       "positionInfo, unitPrice, dealHouseInfo, postPrice, dealCycle) "
                       "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
                values = (info_dict['id'], info_dict['qu'], info_dict['zhen'], info_dict['xiaoquName'], info_dict['xiaoquYear'],
                          info_dict['title'], info_dict['houseInfo'], info_dict['dealDate'], info_dict['totalPrice'],
                          info_dict['positionInfo'], info_dict['unitPrice'], info_dict['dealHouseInfo'],
                          info_dict['postPrice'], info_dict['dealCycle'])
            except Exception as e:
                print(">>>>>> parse error!")
                continue
            try:
                cursor.execute(sql, values)
            except Exception as e:
                # Duplicate primary key: this compound was already scraped, so stop here
                print(">>>>>> compound %s already exists!!" % xiaoqu_name)
                conn.commit()
                cursor.close()
                conn.close()
                return False
            index = index + 1
        print("compound: %s, inserted %d rows" % (xiaoqu_name, index))
        conn.commit()
        cursor.close()
        conn.close()
    # Given a district (qu) name, fetch all of its towns (zhen)
    async def get_all_zhen_from_qu(self, qu):
        url = 'https://sh.lianjia.com/xiaoqu/%s/' % qu
        html_content = await self.request_data(url)
        if html_content is None:
            return
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(html_content, 'lxml')
        div_ershoufang = soup.find('div', {'data-role': 'ershoufang'})
        if div_ershoufang:
            div_list = div_ershoufang.find_all('div')
            # If there are at least two divs, the second one holds the town links
            if len(div_list) >= 2:
                second_div = div_list[1]
                a_tags = second_div.find_all('a', href=True)
                # Extract the second