爬虫爬取链家房源数据并保存本地excel或txt

import requests,re
from openpyxl.workbook import Workbook
from openpyxl.writer.excel import ExcelWriter


def get_page(url):
    """Fetch *url* and return the response body as text.

    A desktop-browser User-Agent is sent because Lianjia rejects the
    default python-requests UA.

    Args:
        url: Absolute URL of the listing page to download.

    Returns:
        The decoded HTML of the page (``response.text``).

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    headers = {
    "User-Agent":"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4"
    }
    # Bug fix: the original call had no timeout, so one stalled connection
    # would hang the whole 100-page crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    return response.text


def parse_page(html):
    """Extract listing tuples from one Lianjia search-results page.

    NOTE(review): the regex in the published source was corrupted — the HTML
    tags inside the pattern were stripped when the article was posted, which
    left the file syntactically invalid.  The pattern below is a best-effort
    reconstruction with the same 12 capture groups the rest of the script
    indexes into; verify it against the live Lianjia page markup before
    trusting the scraped data.

    Args:
        html: Raw HTML of a search-results page.

    Returns:
        List of 12-tuples:
        (price, price unit, house code, title,
         estate, layout, area, orientation, decoration,
         followers, posted, tag).
    """
    pattern = re.compile(
        r'price">(.*?)<span>(.*?)</span>'
        r'.*?data-housecode="(.*?)"'
        r'.*?data-el=.*?>(.*?)</a>'
        r'.*?houseInfo.*?>(.*?)/(.*?)/(.*?)/(.*?)/(.*?)</div>'
        r'.*?followInfo.*?>(.*?)<.*?>(.*?)<.*?>(.*?)<',
        re.S,
    )
    return re.findall(pattern, html)


def parse_all():
    """Scrape Shanghai second-hand listing pages 1-100.

    Returns:
        The concatenated list of listing tuples from every page.
    """
    property_data = []
    for i in range(1, 101):
        url = ("https://sh.lianjia.com/ershoufang/pg{0}/?utm_source=baidu"
               "&utm_medium=pinzhuan&utm_term=biaoti&utm_content=biaotimiaoshu"
               "&utm_campaign=sousuo&ljref=pc_sem_baidu_ppzq_x").format(i)
        html = get_page(url)
        property_data.extend(parse_page(html))
    return property_data


def save_property_list():
    """Scrape all pages and save the listings to an .xlsx workbook.

    Column 1 combines the price value and its unit (groups 0 and 1);
    columns 2-11 are groups 2-11 of each listing tuple.
    """
    property_list = parse_all()
    wb = Workbook()
    ws = wb.worksheets[0]
    ws.title = "链家房源-上海-202002"
    headers = ["价格", "房源", "描述", "位置", "房型", "面积",
               "朝向", "装修", "注一", "注二", "注三"]
    for col, header in enumerate(headers, start=1):
        ws.cell(row=1, column=col).value = header
    for i, data_list in enumerate(property_list):
        for j in range(1, len(data_list)):
            # First column shows "<price><unit>", e.g. "500万".
            val = data_list[0] + data_list[1] if j == 1 else data_list[j]
            ws.cell(row=i + 2, column=j).value = val
    wb.save("上海二月份链家房源.xlsx")


def main():
    save_property_list()


if __name__ == "__main__":
    main()

---

import requests,re
# Variant 2: saves the scraped results to a plain-text (.txt) file instead of Excel



def get_page(url):
    """Download *url* with a desktop-browser User-Agent and return its HTML text."""
    ua = ("Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) "
          "AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4")
    resp = requests.get(url, headers={"User-Agent": ua})
    return resp.text


def parse_page(html):
    """Extract listing tuples from one Lianjia search-results page.

    NOTE(review): the regex in the published source was corrupted — the HTML
    tags inside the pattern were stripped when the article was posted, which
    left the file syntactically invalid.  The pattern below is a best-effort
    reconstruction with the same 12 capture groups the rest of the script
    indexes into; verify it against the live Lianjia page markup before
    trusting the scraped data.

    Args:
        html: Raw HTML of a search-results page.

    Returns:
        List of 12-tuples:
        (price, price unit, house code, title,
         estate, layout, area, orientation, decoration,
         followers, posted, tag).
    """
    pattern = re.compile(
        r'price">(.*?)<span>(.*?)</span>'
        r'.*?data-housecode="(.*?)"'
        r'.*?data-el=.*?>(.*?)</a>'
        r'.*?houseInfo.*?>(.*?)/(.*?)/(.*?)/(.*?)/(.*?)</div>'
        r'.*?followInfo.*?>(.*?)<.*?>(.*?)<.*?>(.*?)<',
        re.S,
    )
    return re.findall(pattern, html)


def parse_all():
    """Scrape Shanghai second-hand listing pages 1-100.

    Returns:
        The concatenated list of listing tuples from every page.
    """
    property_data = []
    for i in range(1, 101):
        url = ("https://sh.lianjia.com/ershoufang/pg{0}/?utm_source=baidu"
               "&utm_medium=pinzhuan&utm_term=biaoti&utm_content=biaotimiaoshu"
               "&utm_campaign=sousuo&ljref=pc_sem_baidu_ppzq_x").format(i)
        html = get_page(url)
        property_data.extend(parse_page(html))
    return property_data


def save_property_list():
    """Scrape all pages and append each listing to 上海二月份链家房源.txt.

    Each record is written one field per line — the first line combines the
    price and its unit — followed by a 50-character "=" separator row.
    """
    property_list = parse_all()
    # Bug fix: the original re-opened and closed the file once per record
    # inside the loop; open it a single time with a context manager instead.
    with open("上海二月份链家房源.txt", "a", encoding="utf-8") as out:
        for data_list in property_list:
            fields = [data_list[0] + data_list[1]] + list(data_list[2:12])
            out.write("\n".join(fields))
            out.write("\n" + "=" * 50 + "\n")


def main():
    save_property_list()


if __name__ == "__main__":
    main()

你可能感兴趣的:(笔记)