Python web scraping: requests + BeautifulSoup + pymysql

1. Requirement: scrape rental listings from the Lianjia site and store the results in a MySQL database;

    Target URL: https://sh.lianjia.com/zufang/

2. Preparation:

  • Install Python 3.6: download the installer and click through it; the steps are not covered in detail here;
  • Install the requests library: pip install requests;
  • Install the bs4 library: pip install bs4;
  • Install the PyMySQL library (for Python 3): pip install PyMySQL;
  • Editor: Sublime Text 3;

PS: on Python 2.7, the MySQL driver is installed with pip install MySQL-python instead. A quick import check to verify the setup is sketched below.
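A simple sanity check for the environment is to import the three libraries and print their versions; this is only a verification snippet, not part of the scraper:

import requests
import bs4
import pymysql

# If any of these imports fails, the corresponding pip install step above was skipped.
print(requests.__version__, bs4.__version__, pymysql.__version__)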

3. MySQL table structure:

CREATE TABLE `lianjian_house` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `price` float(255,0) NOT NULL,
  `unit` varchar(255) CHARACTER SET utf8mb4 DEFAULT NULL,
  `decoration` varchar(255) DEFAULT NULL,
  `create_time` varchar(255) DEFAULT NULL,
  `subway` varchar(255) DEFAULT NULL,
  `area` varchar(255) DEFAULT NULL,
  `layout` varchar(255) DEFAULT NULL,
  `direction` varchar(255) DEFAULT NULL,
  `community` varchar(255) DEFAULT NULL,
  `location` varchar(255) DEFAULT NULL,
  `agent_name` varchar(255) DEFAULT NULL,
  `agent_id` int(20) DEFAULT NULL,
  `score` varchar(255) DEFAULT NULL,
  `number` varchar(255) DEFAULT NULL,
  `times` int(20) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=2 DEFAULT CHARSET=gbk;
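If you prefer to create the table from Python rather than from a MySQL client, the DDL above can be executed through pymysql. This is only a sketch: the connection settings mirror the ones the scraper uses in section 4 (localhost, user root, database wanmen_db), the password is a placeholder, and create_lianjian_house.sql is assumed to be the statement above saved to a file.

import pymysql

# Run the CREATE TABLE statement above, saved as create_lianjian_house.sql (hypothetical file name).
db = pymysql.connect(host='localhost', user='root', password='your_password',
                     db='wanmen_db', charset='utf8mb4')
with open('create_lianjian_house.sql', encoding='utf8') as f:
    ddl = f.read()
cursor = db.cursor()
cursor.execute(ddl)
db.close()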

4. Scraper source code:

import time
import pymysql
import requests
from bs4 import BeautifulSoup

# Fetch a page and return it as a parsed BeautifulSoup object
def get_page(url):
   response = requests.get(url)
   soup = BeautifulSoup(response.text, 'lxml')
   return soup

# Collect the detail-page URL of every listing on the list page
def get_links(url):
   soup = get_page(url)
   links_div = soup.find_all('div', class_='pic-panel')
   links = [div.a.get('href') for div in links_div]
   return links

# Extract the details of a single listing
def get_house_info(url):      
   soup = get_page(url)
   price = soup.find('span', class_='total').text
   #print (price)
   unit = soup.find('span', class_='unit').text.strip()
   #print (unit)
   house_info = soup.find_all('p')
   #print (house_info)
   area = house_info[0].text[3:]
   #print (area)
   layout = house_info[1].text[5:]
   #print (layout)
   floor = house_info[2].text[3:]
   #print (floor)
   direction = house_info[3].text[5:]
   #print (direction)
   subway = house_info[4].text[3:]
   #print (subway)
   community = house_info[5].text[3:]
   #print (community)
   location = house_info[6].text[3:]
   #print (location)
   create_time = house_info[7].text[3:]
   #print (create_time)

   agent = soup.find('a', class_='name LOGCLICK')
   #print (agent)
   agent_name = agent.text
   agent_id = agent.get('data-el')
   #print (agent_name, agent_id)

   evaluate = soup.find('div', class_='evaluate')
   #print (evaluate)

   times = evaluate.find('span', class_='time').text[5:-1]
   #print (times)

   info = {
      '价格':price,
      '单位':unit,
      '面积':area,
      '户型':layout,
      '朝向':direction,
      '发布时间':create_time,
      '地铁':subway,
      '小区':community,
      '位置':location,
      '经纪人名字':agent_name,
      '经纪人id':agent_id,
      '查看次数':times
      }
   return info

# Insert one scraped listing into the database
def insert(db, house):
   values = "'{}',"*11+"'{}'"
   sql_values = values.format(house['价格'],house['单位'],house['面积'],house['户型'],house['朝向'],
                     house['发布时间'],house['地铁'],house['小区'],house['位置'],house['经纪人名字'],
                     house['经纪人id'],house['查看次数'])
   sql ="insert into lianjian_house(price,unit,area,layout,direction,create_time,subway,community,location,agent_name,agent_id,times) values({})".format(sql_values)
   cursor = db.cursor()
   cursor.execute(sql)
   db.commit()



# Connect to MySQL; adjust host/user/password/db to your own environment
db = pymysql.connect(host='localhost', user='root', password='manmanw68', db='wanmen_db', charset='utf8mb4')

url = 'https://sh.lianjia.com/zufang/'
links = get_links(url)

for link in links:
   time.sleep(1)   # pause one second between requests to avoid hammering the site
   house = get_house_info(link)
   print('Fetched one listing successfully!')
   print(house, end='\r')
   insert(db, house)

db.close()
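One caveat about insert(): it builds the SQL by plain string formatting, so a value containing a single quote will break the statement and the code is open to SQL injection. A safer variant, shown here as a sketch with the same column list, lets pymysql escape the values through %s placeholders:

# Safer variant of insert(): pymysql escapes the values passed as parameters.
def insert_safe(db, house):
   sql = ("insert into lianjian_house(price,unit,area,layout,direction,create_time,"
          "subway,community,location,agent_name,agent_id,times) "
          "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
   params = (house['价格'], house['单位'], house['面积'], house['户型'], house['朝向'],
             house['发布时间'], house['地铁'], house['小区'], house['位置'],
             house['经纪人名字'], house['经纪人id'], house['查看次数'])
   cursor = db.cursor()
   cursor.execute(sql, params)
   db.commit()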

