# Note: do not scrape this site with XPath-style bulk extraction — it gets the client blocked.
# 引入包
import requests
from bs4 import BeautifulSoup
import time
import csv
# Custom request headers — substitute your own User-Agent if this one gets blocked.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
}
# Ask the user which city to query (used as the subdomain of the listing URL).
chaxun = input('请输入要查询的城市:')
# Build the for-sale listing URL for that city.
link = 'https://' + chaxun + '.anjuke.com/sale/'
# Fetch the page; fail fast on HTTP errors instead of parsing an error page.
r = requests.get(link, headers=headers, timeout=100)
r.raise_for_status()
# Parse the returned HTML.
soup = BeautifulSoup(r.text, 'lxml')
house_list = soup.find_all('li', class_="list-item")
# Append the scraped rows to test.csv, encoded as UTF-8.
with open('test.csv', 'a', encoding='UTF-8', newline='') as csvfile:
    w = csv.writer(csvfile)
    for house in house_list:
        try:
            name = house.find('div', class_="house-title").a.text.strip()
            price = house.find('span', class_='price-det').text.strip()
            price_area = house.find('span', class_='unit-price').text.strip()
            # The first details-item div holds rooms / area / floor / year at
            # fixed child positions; look it up once instead of four times.
            details = house.find('div', class_='details-item')
            no_room = details.span.text
            area = details.contents[3].text
            floor = details.contents[5].text
            year = details.contents[7].text
            # Drop the leading character the site prefixes to the broker name.
            broker = house.find('span', class_='brokername').text[1:]
            # Collapse all runs of whitespace (incl. \xa0 and newlines) in the
            # address — more robust than matching one exact whitespace string.
            address = house.find('span', class_='comm-address').text.strip()
            address = ' '.join(address.split())
            tag_list = house.find_all('span', class_='item-tags')
            tags = [tag.text for tag in tag_list]
        except AttributeError:
            # A listing with unexpected markup (missing element) is skipped
            # instead of aborting the whole scrape.
            continue
        temp = [name, price, price_area, no_room, area,
                floor, year, broker, address, tags]
        print(temp)
        # Write one CSV row per listing.
        w.writerow(temp)