# 第一个是爬取到家网（cq.daojiale.com）的爬虫
import scrapy
import pymysql.cursors
import time
import datetime
class CQRealEstate2(scrapy.Spider):
    """Crawl second-hand housing listings from cq.daojiale.com and store
    each listing as a row in the MySQL table `fang`.

    Walks listing pages pg1..pg1799, follows every detail-page link found
    on each page, and inserts id / title / price / unit price / details /
    address / url for every listing parsed.
    """

    name = "CQRealEstate2"
    id = 14000  # running counter used as the inserted row's primary key
    pg = 1      # next listing page number to schedule
    start_urls = ["https://cq.daojiale.com/ershoufang/pg1"]

    def parse(self, response):
        """Collect detail-page URLs from one listing page, then queue the next page."""
        self.pg = self.pg + 1
        url_list = response.xpath(
            '//ul[contains(@class,"fylist")]/li//div[contains(@class,"fyimg")]//a/@href'
        ).extract()
        print(url_list)
        for url in url_list:
            # Carry the URL in meta so it can be stored alongside the row.
            yield scrapy.Request(url, meta={'url': url}, callback=self.test)
        if self.pg != 1800:  # stop after page 1799
            next_page = "https://cq.daojiale.com/ershoufang/pg" + str(self.pg)
            yield scrapy.Request(next_page, callback=self.parse)

    def test(self, response):
        """Parse a single listing detail page and insert it into MySQL."""
        url = response.meta['url']
        self.id = int(self.id) + 1
        title = response.xpath('//div[contains(@class,"conmaxtit")]//strong/text()').extract()
        if not title:
            return  # not a listing page (or layout changed); nothing to store
        rmb = response.xpath('//div[contains(@class,"djnum")]//strong/text()').extract()
        unitPrice = response.xpath('//div[contains(@class,"djnum")]//div//h3/text()').extract()
        mainInfo = response.xpath('//ul[contains(@class,"jbinfo")]//li//strong/text()').extract()
        subInfo = response.xpath('//ul[contains(@class,"mtinfo")]//li//span/text()').extract()
        # Skip pages whose layout doesn't yield the expected fields instead of
        # raising IndexError mid-parse (the original crashed here on odd pages).
        if not rmb or not unitPrice or len(mainInfo) < 3 or len(subInfo) < 3:
            return
        details = mainInfo[0] + "-" + mainInfo[1] + "建面-" + mainInfo[2]
        details = details + "-" + subInfo[0] + "套内-" + subInfo[1] + "楼层" + subInfo[2]
        info = response.xpath('//ul[contains(@class,"mtinfo")]//li//span//a/text()').extract()
        address = "".join(info).replace(" ", "")
        connection = self.db()
        try:
            with connection.cursor() as cursor:
                # Parameterized query: the previous string-concatenated SQL was
                # open to SQL injection and broke on values containing quotes.
                cursor.execute(
                    "INSERT INTO `fang` (`id`, `titles`, `rmb`, `unitPrice`, "
                    "`details`, `address`, `url`) VALUES (%s, %s, %s, %s, %s, %s, %s)",
                    (self.id, title[0], rmb[0], unitPrice[0], details, address, url),
                )
            connection.commit()
        finally:
            connection.close()  # the original leaked one connection per page

    def db(self):
        """Open and return a new MySQL connection (caller is responsible for closing it)."""
        connection = pymysql.connect(host='172.21.92.116', port=3306, user='root',
                                     password='a123456a', db='test', charset='utf8mb4',
                                     cursorclass=pymysql.cursors.DictCursor)
        return connection
# 第二个是爬取链家网（cq.lianjia.com）的爬虫
import scrapy
import pymysql.cursors
import time
import datetime
class CQRealEstate(scrapy.Spider):
    """Crawl second-hand housing listings from cq.lianjia.com (Yuzhong
    district) and store each listing as a row in the MySQL table `fang`.

    Walks listing pages 1..99, follows every detail-page link found on
    each page, and inserts id / title / price / unit price / details /
    address for every listing parsed.
    """

    name = "CQRealEstate"
    id = 9000  # running counter used as the inserted row's primary key
    pg = 1     # next listing page number to schedule
    start_urls = ["https://cq.lianjia.com/ershoufang/yuzhong/co32/?utm_source=baidu&utm_medium=ppc&utm_term=%E9%93%BE%E5%AE%B6%E5%9C%B0%E4%BA%A7%E9%87%8D%E5%BA%86&utm_content=%E5%93%81%E7%89%8C&utm_campaign=%E9%87%8D%E5%BA%86_%E5%95%86%E6%9C%BA%E5%B9%BF%E6%B3%9B"]
    # start_urls = ["https://cq.lianjia.com/ershoufang/106102400727.html"]

    def parse(self, response):
        """Collect detail-page URLs from one listing page, then queue the next page."""
        self.pg = self.pg + 1
        url_list = response.xpath(
            '//div[contains(@class,"info clear")]//div[contains(@class,"title")]//a/@href'
        ).extract()
        for url in url_list:
            yield scrapy.Request(url, callback=self.test)
        if self.pg != 100:  # stop after page 99
            next_page = ("https://cq.lianjia.com/ershoufang/yuzhong/pg" + str(self.pg)
                         + "co32/?utm_source=baidu&utm_medium=ppc&utm_term=链家地产重庆&utm_content=品牌&utm_campaign=重庆_商机广泛")
            yield scrapy.Request(next_page, callback=self.parse)

    def test(self, response):
        """Parse a single listing detail page and insert it into MySQL."""
        self.id = int(self.id) + 1
        title = response.xpath('//h1[contains(@class,"main")]/text()').extract()
        if not title:
            return  # not a listing page (or layout changed); nothing to store
        rmb = response.xpath('//span[contains(@class,"total")]/text()').extract()
        rmb1 = response.xpath('//span[contains(@class,"unitPriceValue")]/text()').extract()
        mainInfo = response.xpath('//div[contains(@class,"mainInfo")]/text()').extract()
        subInfo = response.xpath('//div[contains(@class,"subInfo")]/text()').extract()
        # Skip pages whose layout doesn't yield the expected fields instead of
        # raising IndexError mid-parse (the original crashed here on odd pages).
        if not rmb or not rmb1 or len(mainInfo) < 3 or len(subInfo) < 3:
            return
        details = mainInfo[0] + " " + mainInfo[1] + " " + mainInfo[2]
        details = details + " " + subInfo[0] + " " + subInfo[1] + " " + subInfo[2]
        info = response.xpath('//div[contains(@class,"areaName")]//text()').extract()
        address = "".join(info).replace(" ", "").replace("所在区域", "")
        connection = self.db()
        try:
            with connection.cursor() as cursor:
                # Parameterized query: the previous string-concatenated SQL was
                # open to SQL injection and broke on values containing quotes.
                cursor.execute(
                    "INSERT INTO `fang` (`id`, `titles`, `rmb`, `unitPrice`, "
                    "`details`, `address`) VALUES (%s, %s, %s, %s, %s, %s)",
                    (self.id,
                     title[0].replace(" ", ""),
                     rmb[0],
                     rmb1[0].replace(" ", ""),
                     details.replace(" ", "-"),
                     address),
                )
            connection.commit()
        finally:
            connection.close()  # the original leaked one connection per page

    def db(self):
        """Open and return a new MySQL connection (caller is responsible for closing it)."""
        connection = pymysql.connect(host='172.21.92.116', port=3306, user='root',
                                     password='a123456a', db='test', charset='utf8mb4',
                                     cursorclass=pymysql.cursors.DictCursor)
        return connection