A Small Python Crawler Exercise: Scraping Listing Data from Beike (ke.com)
Modules used
Standard library
Python 3 standard library list
logging
Useful handler classes are declared in logging.handlers; three of them (StreamHandler, FileHandler and NullHandler) are actually defined in the logging module itself, but they are documented together with the other handlers.
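The script later in this post builds its timestamped output by hand with datetime; the logging module could produce the same kind of output. A minimal sketch (the logger name and format string are illustrative assumptions, not part of the original script):

import logging

# Timestamped messages to stdout; the format string is just an example
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(message)s",
)
logger = logging.getLogger("beike_crawler")  # hypothetical logger name
logger.info("fetching page %s", 1)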
Third-party libraries
requests: a popular HTTP library for Python that makes it easy to send HTTP requests to a site and read back the response; it is more concise than urllib. Official page: Python requests
BeautifulSoup: a Python library for pulling data out of HTML or XML files. Official page: BeautifulSoup
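A tiny, self-contained illustration of the two calls used throughout the crawler, select() and .text; the HTML snippet here is made up for the example:

from bs4 import BeautifulSoup

html = '<div class="total">共找到 <span> 42 </span> 套二手房</div>'  # made-up snippet
soup = BeautifulSoup(html, 'html.parser')
print(soup.select('.total span')[0].text.strip())  # prints: 42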
Steps involved
Requesting the URL
Simulating a browser
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36'
}
URL encoding
import urllib.parse
baseUrl = "https://nj.ke.com/ershoufang/"
url = baseUrl + "天润城/"
encoded_url = urllib.parse.quote(url, safe='/:?+=')
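quote() percent-encodes the Chinese path segment while leaving the characters listed in safe untouched, so the resulting URL is ASCII-only:

print(encoded_url)
# https://nj.ke.com/ershoufang/%E5%A4%A9%E6%B6%A6%E5%9F%8E/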
Without user authentication
response = requests.get(encoded_url, headers=headers)
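Before parsing the response it is worth confirming the request actually succeeded; a minimal check (not in the original script):

response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
print(response.status_code, response.encoding)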
With user authentication (cookie)
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Cookie': 'lianjia_token=your-token-value-here'
}
response = requests.get(encoded_url, headers=headers)
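The cookie can also be passed as a structured cookies argument instead of a raw header string; a sketch, with the token value as a placeholder:

cookies = {'lianjia_token': 'your-token-value-here'}  # placeholder value
response = requests.get(encoded_url, headers=headers, cookies=cookies)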
Proxy: if requests have to go through a corporate proxy, it needs to be configured.
proxies = {"https": "http://111:8080"}
response = requests.get(encoded_url, headers=headers, proxies=proxies)
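requests also reads the standard HTTP_PROXY / HTTPS_PROXY environment variables by default (trust_env is on), so the same proxy can be supplied without touching the code; a sketch using the placeholder address from above:

import os

os.environ['HTTPS_PROXY'] = 'http://111:8080'  # placeholder proxy address
response = requests.get(encoded_url, headers=headers)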
Parsing the HTML
soup = BeautifulSoup(response.text, 'html.parser')
Reading an attribute
soup.select('.title a')[0].attrs.get('href')
Reading a tag's text
soup.select(".total span")[0].text.strip()
Downloading image resources
import urllib.request

# Configure the proxy for urllib.request
urllib.request.install_opener(
    urllib.request.build_opener(
        urllib.request.ProxyHandler(proxies)
    )
)
urllib.request.urlretrieve(housingImgUrl, housingTypeImagePath)
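Alternatively, the image could be fetched with requests itself, reusing the headers and proxies already set up for the page requests; a sketch with the same variable names as above:

img_response = requests.get(housingImgUrl, headers=headers, proxies=proxies)
img_response.raise_for_status()
with open(housingTypeImagePath, 'wb') as f:
    f.write(img_response.content)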
Analyzing the data
Writing to a SQLite 3 database
Create the table (run the SQL script)
Insert the rows
Handle exceptions
import sqlite3
import datetime

conn = sqlite3.connect('../db/identifier.sqlite', check_same_thread=False)
c = conn.cursor()
# Run the table-creation SQL script
with open('../db/script/house_listing_price.sql') as sql_file:
    c.executescript(sql_file.read())
conn.commit()
for house_info in house_info_list:
    sql = f'insert into house_listing_price values (' \
          f'"{house_info["houseid"]}"' \
          f',"{house_info["title"]}"' \
          f',"{house_info["price"]}"' \
          f',"{house_info["address"]}"' \
          f',"{house_info["area"]}"' \
          f',"{house_info["sealDate"]}"' \
          f',"{house_info["housingType"]}"' \
          f',"{house_info["houseUrl"]}")'
    try:
        c.execute("BEGIN")
        c.execute(sql)
        c.execute("COMMIT")
    except sqlite3.Error:
        print("[" + str(datetime.datetime.now()) + "] " + "database write failed, sql is [" + sql + "]")
        c.execute("ROLLBACK")
conn.commit()
conn.close()
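Building the INSERT statement with an f-string breaks as soon as a field contains a double quote and leaves the script open to SQL injection; a parameterized variant is safer. A sketch, assuming the table has the same eight columns in the same order as above:

insert_sql = 'insert into house_listing_price values (?, ?, ?, ?, ?, ?, ?, ?)'
for house_info in house_info_list:
    try:
        c.execute(insert_sql, (
            house_info['houseid'], house_info['title'], house_info['price'],
            house_info['address'], house_info['area'], house_info['sealDate'],
            house_info['housingType'], house_info['houseUrl'],
        ))
    except sqlite3.Error as e:
        print(f"[{datetime.datetime.now()}] insert failed: {e}")
conn.commit()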
Complete example
import requests
from bs4 import BeautifulSoup
import math
import datetime
import sqlite3
import urllib.request
import os
# Proxy, for use on a corporate network
proxies = {"https": "http://xzproxy.cnsuning.com:8080"}
# Without a proxy:
# proxies = {}
# Configure the proxy for urllib.request, which downloads the images
urllib.request.install_opener(
    urllib.request.build_opener(
        urllib.request.ProxyHandler(proxies)
    )
)
# Header that makes the request look like a normal browser
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36'
}
# Listing URL, not paginated
url = "https://nj.ke.com/ershoufang/co22l2rs%E5%A4%A9%E6%B6%A6%E5%9F%8E%E5%8D%81%E5%9B%9B%E8%A1%97%E5%8C%BA/"
response = requests.get(url, headers=headers, proxies=proxies)
soup = BeautifulSoup(response.text, 'html.parser')
# The site shows 30 listings per page
everypagecount = 30
sumhouse = soup.select(".total span")[0].text.strip()
pagesum = int(sumhouse) / everypagecount
pagesum = math.ceil(pagesum)
# The site only serves the first 100 pages
pagesum = min(pagesum, 100)
print("[" + str(datetime.datetime.now()) + "] " + "total listings: " + str(sumhouse) + ", total pages: " + str(pagesum))
# Empty list that will collect the listing records
house_info_list = []
# Fetch one page of listing data
def requestUrl(real_url):
    response = requests.get(real_url, headers=headers, proxies=proxies)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Grab the list of listings on this page
    house_list = soup.select('.sellListContent li .clear')
    # Walk the listings and extract the fields we need
    for house in house_list:
        # Listing title
        title = house.select('.title a')[0].text.strip()
        # Asking price
        price = house.select('.totalPrice span')[0].text.strip()
        # Name of the housing estate (address)
        address = house.select('.positionInfo a')[0].text.strip()
        # Short floor description
        area = house.select('.houseInfo')[0].text.strip().replace(
            '\n', '').replace(' ', '').split('|')[0]
        area = area[0:area.index(')') + 1]
        # House registration number
        houseId = house.select('.unitPrice')[0].attrs.get('data-hid')
        # URL of the listing's detail page
        href = house.select('.title a')[0].attrs.get('href')
        response2 = requests.get(href, headers=headers, proxies=proxies)
        soup2 = BeautifulSoup(response2.text, 'html.parser')
        # Listing date
        sealDate = soup2.select('.introContent .transaction li')[0].text.strip()[4:]
        # Floor plan (layout)
        housingType = soup2.select('.introContent .base .content li')[0].text.strip()[4:].strip()
        # List of house images
        house_images_list = soup2.select('.thumbnail .smallpic li')
        housingTypeImagePath = "../src/main/resources/images/housingType/" + houseId + ".jpg"
        for house_images in house_images_list:
            # Download the floor-plan image ("户型图")
            if "户型图" == house_images.attrs.get("data-desc") and not os.path.exists(housingTypeImagePath):
                housingImgUrl = house_images.attrs.get("data-src")
                urllib.request.urlretrieve(
                    housingImgUrl,
                    housingTypeImagePath)
        # Append the extracted fields to the listing list
        house_info_list.append({
            'title': title,
            'price': price,
            'address': address,
            'area': area,
            'houseid': houseId,
            'sealDate': sealDate,
            'housingType': housingType,
            'houseUrl': href
        })
    return
pageNo = 0
while pageNo < pagesum:
    currentPageNo = str(pageNo + 1)
    # Listing URL, paginated
    url = 'https://nj.ke.com/ershoufang/pg' + currentPageNo + 'co22l2rs%E5%A4%A9%E6%B6%A6%E5%9F%8E%E5%8D%81%E5%9B%9B%E8%A1%97%E5%8C%BA/'
    print("[" + str(datetime.datetime.now()) + "] " + "fetching page " + currentPageNo)
    requestUrl(url)
    pageNo = pageNo + 1
# Optionally save the listing data to a CSV file
import csv
# print("writing CSV file")
# current_date = datetime.datetime.now()
# formatted_date = current_date.strftime("%Y-%m-%d")
# filename = "house_info-" + formatted_date + ".csv"
# with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
#     writer = csv.writer(f)
#     writer.writerow(['title', 'price', 'address', 'area', 'house id'])
#     for house_info in house_info_list:
#         writer.writerow([
#             house_info['title'], house_info['price'], house_info['address'],
#             house_info['area'], house_info['houseid']
#         ])
# print("CSV written")
print("[" + str(datetime.datetime.now()) + "] " + "writing to database")
conn = sqlite3.connect('../db/identifier.sqlite', check_same_thread=False)
c = conn.cursor()
# Run the table-creation SQL script
with open('../db/script/house_listing_price.sql') as sql_file:
    c.executescript(sql_file.read())
conn.commit()
for house_info in house_info_list:
    sql = f'insert into house_listing_price values (' \
          f'"{house_info["houseid"]}"' \
          f',"{house_info["title"]}"' \
          f',"{house_info["price"]}"' \
          f',"{house_info["address"]}"' \
          f',"{house_info["area"]}"' \
          f',"{house_info["sealDate"]}"' \
          f',"{house_info["housingType"]}"' \
          f',"{house_info["houseUrl"]}")'
    try:
        c.execute("BEGIN")
        c.execute(sql)
        c.execute("COMMIT")
    except sqlite3.Error:
        print("[" + str(datetime.datetime.now()) + "] " + "database write failed, sql is [" + sql + "]")
        c.execute("ROLLBACK")
conn.commit()
conn.close()
print("[" + str(datetime.datetime.now()) + "] " + "done writing to database")