# Python爬虫——爬取链家网北京租房信息
#
# 爬取链家网站北京租房信息

# 链家北京市租房信息,并导入本地数据库

import requests
import re
import pymysql

# Module-level MySQL connection and cursor used for the inserts.
# connect() is called with keyword arguments because PyMySQL 1.0+ made its
# parameters keyword-only; the old positional form raises TypeError there.
# charset is set explicitly so the Chinese text values are sent as UTF-8.
# NOTE(review): credentials are hard-coded; consider reading them from the
# environment or a config file that is not committed.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='126315',
    database='petzhang',
    charset='utf8mb4',
)
cursor = db.cursor()

# Browser-like User-Agent passed as the request headers for every page fetch.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.9 Safari/537.36'}


# NOTE(review): the HTML tags inside most regex literals of this script were
# stripped when it was extracted from a web page, and the lines that defined
# the detail-link list and the community list were lost entirely. The patterns
# marked "reconstructed" below are best-effort replacements built from the
# surviving fragments; verify each one against the live page markup.

# Seconds to wait for any single HTTP response before giving up.
_TIMEOUT = 10

# Reconstructed: listing ids, grounded in the detail URL format
# 'https://bj.lianjia.com/zufang/BJ{}' used below.
_DETAIL_ID_RE = re.compile(r'href="/zufang/BJ(.*?)"')
# Reconstructed: three link texts joined by hyphens (district-area-community),
# based on the surviving fragment 'target="_blank">(.*?)--'.
_LOCATION_RE = re.compile(
    r'<a[^>]*>([^<]*)</a>-<a[^>]*>([^<]*)</a>-<a[^>]*>([^<]*)</a>')
# Reconstructed: listing title on the detail page.
_TITLE_RE = re.compile(r'<p class="content__title">(.*?)</p>')
# Reconstructed: the text of the element immediately preceding '元/月'.
_PRICE_RE = re.compile(r'>([^<>]*)</[^>]+>\s*元/月')
# These three patterns survived extraction unchanged.
_AGENT_RE = re.compile(r'name":"(.*?)","office')
_PHONE_RE = re.compile(r'phone400":"(.*?)","phone')
_CJK_RE = re.compile(r'[\u4e00-\u9fa5]+')

# Labels of the "label:value" items read from the detail page.
_INFO_LABELS = ('租赁方式', '房屋类型', '面积', '朝向', '入住', '楼层',
                '电梯', '用水', '用电', '燃气', '采暖', '租期')

# Target columns, in the order the row tuples are built.
_COLUMNS = ('房源标题', '网址', '市区', '商圈', '小区', '租赁方式', '价格',
            '房屋类型', '面积', '朝向', '入住', '楼层', '电梯', '用水',
            '用电', '燃气', '采暖', '租期', '代理人', '联系方式')

# Parameterized insert: values are passed separately to cursor.execute so
# quotes inside scraped text cannot break or alter the statement.
_INSERT_SQL = 'insert into `chaoyang` ({}) values ({})'.format(
    ','.join('`{}`'.format(column) for column in _COLUMNS),
    ','.join(['%s'] * len(_COLUMNS)))


def _fetch(url):
    """Return the body text of *url*, or None if the request fails."""
    try:
        response = requests.get(url, headers=headers, timeout=_TIMEOUT)
    except requests.RequestException as e:
        print(e)
        return None
    return response.text


def _first(pattern, text):
    """Return the first capture group of *pattern* in *text*, or '' if absent."""
    match = pattern.search(text)
    return match.group(1).strip() if match else ''


def _field(label, html):
    """Return the text following '<label>:' up to the next tag, or ''."""
    # Tag-agnostic on purpose: the original surrounding markup was lost, so
    # only the visible label is relied on. Both colon forms are accepted.
    match = re.search(re.escape(label) + r'[::]([^<]*)<', html)
    return match.group(1).strip() if match else ''


def _parse_detail(url, location, html):
    """Build one row tuple (in _COLUMNS order) from a detail page."""
    district, area, community = location
    info = {label: _field(label, html) for label in _INFO_LABELS}
    return (
        _first(_TITLE_RE, html), url, district, area, community,
        info['租赁方式'], _first(_PRICE_RE, html), info['房屋类型'],
        info['面积'], info['朝向'], info['入住'], info['楼层'],
        info['电梯'], info['用水'], info['用电'], info['燃气'],
        info['采暖'], info['租期'],
        _first(_AGENT_RE, html), _first(_PHONE_RE, html),
    )


def _save_rows(rows):
    """Insert each row into `chaoyang`, committing row by row."""
    for row in rows:
        # One try per row so a single bad record does not drop the rest
        # of the page.
        try:
            cursor.execute(_INSERT_SQL, row)
            db.commit()
        except pymysql.MySQLError as e:
            db.rollback()
            print(e)
        else:
            print("已存储" + row[0])


def getdata(n):
    """Scrape page *n* of the Chaoyang rental listings and store every listing.

    Fetches the list page, follows each listing's detail link, extracts the
    listing fields and inserts one row per listing into the `chaoyang` table
    through the module-level ``cursor`` / ``db``.

    Args:
        n: 1-based page number substituted into the list URL.

    Returns:
        None. Progress and errors are printed.
    """
    first_url = 'https://bj.lianjia.com/zufang/chaoyang/pg{}'.format(n)
    page = _fetch(first_url)
    if page is None:
        return

    # A listing may link to its detail page more than once; keep the first
    # occurrence of each id so every listing is fetched only once.
    detail_ids = list(dict.fromkeys(_DETAIL_ID_RE.findall(page)))

    # District keeps only its Chinese characters, as the original code did.
    locations = [
        (''.join(_CJK_RE.findall(district)), area, community)
        for district, area, community in _LOCATION_RE.findall(page)
    ]

    rows = []
    for i, detail_id in enumerate(detail_ids):
        detailurl = 'https://bj.lianjia.com/zufang/BJ{}'.format(detail_id)
        html = _fetch(detailurl)
        if html is None:
            continue
        # NOTE(review): locations are paired with listings by position;
        # confirm the two patterns yield the same number of matches.
        location = locations[i] if i < len(locations) else ('', '', '')
        rows.append(_parse_detail(detailurl, location, html))

    print('page'+'-'+str(n))
    _save_rows(rows)


if __name__ == '__main__':
    for n in range(1, 101):
        getdata(n)

# python爬虫——爬取链家网北京租房信息_第1张图片
#
# 你可能感兴趣的:(数据库,python,列表)