import time
import requests
import random
import pymysql
from lxml import etree
class AnJuKe():
    """Crawl Anjuke rental listings and store every listing in MySQL.

    Creating an instance opens the database connection, downloads the city
    index page at ``url`` and immediately crawls the rental result pages of
    the selected cities (see ``result_city``).  Call ``close`` once the
    instance is no longer needed.
    """

    # Placeholder stored when a field cannot be found on the page.
    MISSING = '无信息'
    # Result pages requested per city (pages 1..MAX_PAGES).
    MAX_PAGES = 50
    # Seconds to pause after each result page.
    PAGE_DELAY = 120
    # Substitute glyphs that appear in the page text in place of digits.
    _DIGITS = {
        '驋': '1', '餼': '2', '龤': '3', '麣': '4', '鑶': '5',
        '齤': '6', '鸺': '7', '閏': '8', '龥': '9', '龒': '0',
    }

    def __init__(self, url):
        """Connect to MySQL, fetch the city index at ``url`` and run the crawl.

        Any exception raised while crawling propagates to the caller; the
        database connection is closed first because the caller never
        receives the instance in that case.
        """
        # NOTE(review): connection settings are hard-coded; consider moving
        # them to configuration.
        self.connect = pymysql.connect(
            host='localhost',
            db='pachong',
            user='root',
            password='12345',
        )
        self.cursor = self.connect.cursor()
        try:
            self.tree = self.get_tree(url)
            self.result_city()
        except BaseException:
            self.close()
            raise

    def close(self):
        """Close the cursor and the database connection."""
        self.cursor.close()
        self.connect.close()

    def is_empty(self, data):
        """Return the first element of ``data``, or the placeholder if empty."""
        return data[0] if data else self.MISSING

    def get_tree(self, url, timeout=30):
        """Download ``url`` and return it parsed as an lxml HTML tree.

        ``timeout`` is the number of seconds passed to ``requests.get`` so a
        stalled connection cannot block the crawl forever.
        """
        # One proxy is picked at random for every request.
        # NOTE(review): only the 'http' scheme is mapped here - confirm
        # whether https URLs are meant to go through these proxies too.
        proxies_list = [
            {'http': 'http://117.191.11.111:8080'},
            {'http': 'http://118.25.104.254:1080'},
            {'http': 'http://203.195.168.154:3128'},
            {'http': 'http://117.191.11.75:80'},
            {'http': 'http://117.191.11.72:80'},
        ]
        proxies = random.choice(proxies_list)
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'}
        response = requests.get(
            url, headers=headers, proxies=proxies, timeout=timeout
        ).text
        return etree.HTML(response)

    def result_city(self):
        """Crawl the rental result pages of the selected cities.

        City links are read from list items 13-17 of the ``letter_city``
        block of ``self.tree``.  For each city, pages 1..MAX_PAGES are
        fetched and every listing found on them is saved with ``save_mysql``.
        """
        city_links = self.tree.xpath('//div[@class="letter_city"]/ul/li[position()>12 and position()<18]/div/a/@href')
        for city_url in city_links:
            # Keep everything before the first dot of the href; the rental
            # host name is rebuilt from it below.
            city_prefix = city_url.split('.')[0]
            for page in range(1, self.MAX_PAGES + 1):
                url = '%s.zu.anjuke.com/fangyuan/p%s/' % (city_prefix, page)
                tree = self.get_tree(url)
                for room in tree.xpath(r'//div[@class="zu-itemmod"]'):
                    self.save_mysql(*self._parse_room(room))
                # NOTE(review): the original nesting level of this pause
                # could not be recovered from the flattened source; one
                # pause per result page is assumed - confirm.
                time.sleep(self.PAGE_DELAY)

    def _parse_room(self, room):
        """Return the 14 stored fields of one listing, in ``save_mysql`` order.

        Fields that are absent from the listing are replaced by the
        placeholder instead of raising ``IndexError``.
        """
        first = self.is_empty
        decode = self.zhuanma

        title = decode(first(room.xpath(r'./div[@class="zu-info"]/h3/a/b/text()')))
        image = first(room.xpath(r'./a[@class="img"]//img[@class="thumbnail"]/@lazy_src'))
        # Bedrooms, living rooms and area are the three <b> tags of the
        # first paragraph.
        bedroom_num = decode(first(room.xpath(r'./div[@class="zu-info"]/p[1]/b[1]/text()')))
        living_room_num = decode(first(room.xpath(r'./div[@class="zu-info"]/p[1]/b[2]/text()')))
        area = decode(first(room.xpath(r'./div[@class="zu-info"]/p[1]/b[3]/text()')))

        # Floor information and agent are read from the 5th and 6th text
        # nodes of the info paragraphs; query them once.
        texts = room.xpath('./div[@class="zu-info"]/p/text()')
        if len(texts) > 4:
            floor, floors = self._split_floor(texts[4])
        else:
            floor = floors = self.MISSING
        agent = texts[5] if len(texts) > 5 else self.MISSING

        neighborhood = first(room.xpath('./div[@class="zu-info"]/address[@class="details-item"]/a/text()'))
        address_parts = room.xpath('./div[@class="zu-info"]/address[@class="details-item"]/text()')
        # The address is the second text node of the <address> element.
        if len(address_parts) > 1:
            addres = address_parts[1].strip()
        else:
            addres = self.MISSING

        rent_way = first(room.xpath('./div[@class="zu-info"]/p[@class="details-item bot-tag"]/span[1]/text()'))
        face_direction = first(room.xpath('./div[@class="zu-info"]/p[@class="details-item bot-tag"]/span[2]/text()'))
        subline = first(room.xpath('./div[@class="zu-info"]/p[@class="details-item bot-tag"]/span[@class="cls-4"]/text()'))
        price = decode(first(room.xpath('./div[@class="zu-side"]/p/strong/b/text()')))

        return (title, image, bedroom_num, living_room_num, area, floor,
                floors, agent, neighborhood, addres, rent_way,
                face_direction, subline, price)

    @staticmethod
    def _split_floor(text):
        """Split floor text at '共' into ``(floor, floors)``.

        '(' is removed from the first part and ')' from the second; when
        the separator is absent the second value is the placeholder.
        """
        parts = text.split('共')
        floor = parts[0].replace('(', '')
        if len(parts) > 1:
            floors = parts[1].replace(')', '')
        else:
            floors = AnJuKe.MISSING
        return floor, floors

    def save_mysql(self,title,image,bedroom_num,living_room_num,area,floor,floors,agent,neighborhood,addres,rent_way,face_direction,subline,price):
        """Insert one listing into the ``anjuke`` table and commit.

        On failure the transaction is rolled back and the exception is
        re-raised.
        """
        sql = 'insert into anjuke(title,image,bedroom_num,living_room_num,area,floor,floors,agent,neighborhood,addres,rent_way,face_direction,subline,price) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        row = (title, image, bedroom_num, living_room_num, area, floor,
               floors, agent, neighborhood, addres, rent_way,
               face_direction, subline, price)
        try:
            self.cursor.execute(sql, row)
            self.connect.commit()
        except Exception:
            self.connect.rollback()
            raise
        print('数据插入成功')

    def zhuanma(self, mm):
        """Return ``mm`` with every substitute digit glyph replaced by its digit.

        Characters that are not in the lookup table are kept unchanged.
        """
        digits = self._DIGITS
        return ''.join(digits.get(ch, ch) for ch in mm)
if __name__ == '__main__':
    # Start from the city index page; constructing the scraper runs the
    # whole crawl.
    AnJuKe('https://www.anjuke.com/sy-city.html')