1、需求描述:爬取链家网的租房信息,并将爬取结果存储到mysql数据库中;
爬取link:https://sh.lianjia.com/zufang/
2、python爬虫准备工作:
ps:本文代码基于python3(使用了print(..., end='\r')语法),安装mysql驱动:pip install pymysql;
3、mysql数据表结构:
-- Rental listings scraped from https://sh.lianjia.com/zufang/.
-- Table default charset is utf8mb4 so Chinese text inserted over the
-- crawler's utf8mb4 connection is stored intact (the original mixed a
-- gbk table default with a single utf8mb4 column, which risks mojibake).
CREATE TABLE `lianjian_house` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- decimal keeps monetary values exact; the original float(255,0)
  -- stored an approximate value with zero decimal digits.
  `price` decimal(10,2) NOT NULL,
  `unit` varchar(255) DEFAULT NULL,
  `decoration` varchar(255) DEFAULT NULL,
  -- publication date as scraped from the page (free-form text).
  `create_time` varchar(255) DEFAULT NULL,
  `subway` varchar(255) DEFAULT NULL,
  `area` varchar(255) DEFAULT NULL,
  `layout` varchar(255) DEFAULT NULL,
  `direction` varchar(255) DEFAULT NULL,
  `community` varchar(255) DEFAULT NULL,
  `location` varchar(255) DEFAULT NULL,
  `agent_name` varchar(255) DEFAULT NULL,
  `agent_id` int(20) DEFAULT NULL,
  `score` varchar(255) DEFAULT NULL,
  `number` varchar(255) DEFAULT NULL,
  -- view count scraped from the detail page.
  `times` int(20) DEFAULT NULL,
  PRIMARY KEY (`id`)
-- AUTO_INCREMENT=2 (a dump artifact) dropped: a fresh table starts at 1.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
4、爬虫源码:
import time import pymysql import requests from bs4 import BeautifulSoup
# Fetch a page and parse it.
def get_page(url, timeout=10):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: page URL to download.
        timeout: seconds to wait for the server before giving up. The
            original call had no timeout, so one stalled response could
            hang the whole crawler forever.

    Returns:
        BeautifulSoup object built with the lxml parser.

    Raises:
        requests.HTTPError: on a non-2xx response, instead of silently
            parsing an error page and failing later with obscure errors.
        requests.Timeout: when the server does not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')
# Collect the detail-page URL of every listing on a listing page.
def get_links(url):
    """Return the detail-page URLs of all rental listings on *url*.

    Listings live in <div class="pic-panel"> elements; the link is the
    href of the first <a> inside each. Panels that unexpectedly lack an
    anchor or an href are skipped — the original `div.a.get('href')`
    raised AttributeError on such panels and aborted the whole crawl.
    """
    soup = get_page(url)
    links = []
    for panel in soup.find_all('div', class_='pic-panel'):
        anchor = panel.a
        if anchor is None:
            continue
        href = anchor.get('href')
        if href:
            links.append(href)
    return links
# Scrape one rental detail page into a dict of fields.
def get_house_info(url):
    """Scrape a Lianjia rental detail page and return its fields.

    Args:
        url: detail-page URL (as returned by get_links).

    Returns:
        dict mapping Chinese field names to scraped string values.

    NOTE(review): every ``[n:]`` slice below strips a fixed-length
    Chinese label (e.g. '面积:', '房屋户型:') and therefore assumes
    Lianjia's exact page layout — confirm against the live site; these
    break silently if the markup changes.
    """
    soup = get_page(url)
    price = soup.find('span', class_='total').text
    unit = soup.find('span', class_='unit').text.strip()
    house_info = soup.find_all('p')
    area = house_info[0].text[3:]
    layout = house_info[1].text[5:]
    floor = house_info[2].text[3:]
    direction = house_info[3].text[5:]
    subway = house_info[4].text[3:]
    community = house_info[5].text[3:]
    location = house_info[6].text[3:]
    create_time = house_info[7].text[3:]
    agent = soup.find('a', class_='name LOGCLICK')
    agent_name = agent.text
    agent_id = agent.get('data-el')
    evaluate = soup.find('div', class_='evaluate')
    times = evaluate.find('span', class_='time').text[5:-1]
    return {
        '价格': price,
        '单位': unit,
        '面积': area,
        '户型': layout,
        # The original scraped the floor and then silently discarded it;
        # include it (extra keys are ignored by insert(), so this is
        # backward compatible).
        '楼层': floor,
        '朝向': direction,
        '发布时间': create_time,
        '地铁': subway,
        '小区': community,
        '位置': location,
        '经纪人名字': agent_name,
        '经纪人id': agent_id,
        '查看次数': times,
    }
# Persist one scraped house record into MySQL.
def insert(db, house):
    """Insert one house dict into the lianjian_house table.

    Uses DB-API ``%s`` placeholders instead of the original
    ``str.format`` string-building, so values containing quotes can
    neither break the statement nor inject SQL.

    Args:
        db: an open DB-API connection (e.g. a pymysql connection).
        house: dict produced by get_house_info; the 12 keys below
            must be present.
    """
    sql = (
        "insert into lianjian_house"
        "(price,unit,area,layout,direction,create_time,subway,community,"
        "location,agent_name,agent_id,times) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    params = (
        house['价格'], house['单位'], house['面积'], house['户型'],
        house['朝向'], house['发布时间'], house['地铁'], house['小区'],
        house['位置'], house['经纪人名字'], house['经纪人id'], house['查看次数'],
    )
    cursor = db.cursor()
    try:
        cursor.execute(sql, params)
        db.commit()
    finally:
        # The original leaked the cursor; always release it.
        cursor.close()
def main():
    """Crawl the Shanghai Lianjia rental listing page and persist each house."""
    db = pymysql.connect(host='localhost', user='root', password='manmanw68',
                         db='wanmen_db', charset='utf8mb4')
    try:
        url = 'https://sh.lianjia.com/zufang/'
        for link in get_links(url):
            time.sleep(1)  # throttle: be polite to the server
            house = get_house_info(link)
            # Report success only after the fetch actually succeeded
            # (the original printed this before fetching anything).
            print('获取一个房子信息成功!')
            print(house, end='\r')
            insert(db, house)
    finally:
        # The original never closed the connection.
        db.close()


if __name__ == '__main__':
    main()