# 正则表达式爬虫——上海小区大全

import requests

import re

import csv

import time

import random

def get_area_names(url,line_num):
    """Scrape one listing page and write one CSV row per listing entry.

    For every entry matched on the listing page at ``url`` the function
    fetches the entry's own page, looks for a follow-up link on it, fetches
    that link and extracts a ``lat`` / ``lng`` pair from the response.  The
    row is written through the module-level ``writer`` (a ``csv.writer``
    that must exist before this function is called).

    Args:
        url: Address of the listing page to scrape.
        line_num: Number of rows written so far; used as a running counter.

    Returns:
        ``line_num`` increased by the number of rows written for this page.
        If the listing page itself cannot be fetched, ``line_num`` is
        returned unchanged and nothing is written.

    Every request is best-effort: failures are printed and the crawl goes on.
    Random sleeps between requests throttle the crawl.
    """
    # NOTE(review): the cookie is a hard-coded captured browser session; it
    # presumably expires - confirm it is still accepted before a long run.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'cookie':'aQQ_ajkguid=EA9803C9-984A-8523-9851-4A030C3F192C; ctid=11; wmda_uuid=6c700ade52714ad2458ecf83c0e7724e; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; wmda_session_id_6289197098934=1559007118314-d3bbb6df-b03b-4667; sessid=DBF5B260-CD64-5A54-ED48-33FC0D0A3D50; lps=http%3A%2F%2Fwww.anjuke.com%2Fshanghai%2Fcm1210%2F%7C; twe=2; __xsptplusUT_8=1; propertys=s5vy32-ps6xxj_; ajk_member_captcha=b5e4bebf3df2a3edce16d50621e6a514; __xsptplus8=8.2.1559007119.1559007564.4%234%7C%7C%7C%7C%7C%23%23DDSca8CHAo0T4Q1PY0Yv9GoBvpZVNqz2%23; _ga=GA1.2.654788080.1559007564; _gid=GA1.2.721651369.1559007564'
    }

    # Without a timeout a stalled connection would block the crawl forever.
    # A failed listing page is reported and skipped, like the other requests.
    try:
        res = requests.get(url, headers=headers, timeout=5)
    except Exception as err:
        print('查询失败0:', err)
        return line_num

    # NOTE(review): this pattern has a single, empty-context group, so it
    # cannot produce the (link, name) pairs the loop below indexes with
    # row[0] / row[1].  The HTML markup of the original expression appears to
    # have been lost; restore it (two capture groups: entry link, entry name -
    # an example name from the page was "黄山始信苑") before running.
    result = re.findall('(.*?)', res.text, re.S)
    print(result)

    for row in result:
        print(row[1],row[0])

        # coords stays None when no coordinates were found; it becomes a
        # tuple of extra columns when a lookup succeeded or a request failed.
        coords = None
        detail_links = []
        try:
            res1 = requests.get(row[0], headers=headers, timeout=5)
            detail_links = re.findall('_spread_params="commbook_p" href="(.*?)" class="hd-link only_show" target="_blank"', res1.text, re.S)
        except Exception as err:
            print('查询失败1:', err)
            time.sleep(random.randint(1, 3))
            # Record empty coordinates directly instead of issuing a request
            # to a placeholder address that can only fail.
            coords = ('', '')
        time.sleep(random.randint(1, 5))

        if detail_links:
            try:
                res2 = requests.get(detail_links[0], headers=headers, timeout=5)
                found = re.findall('lat : "(.*?)",.*?lng : "(.*?)"', res2.text, re.S)
                coords = found[0] if found else None
            except Exception as err:
                print('查询失败2:', err)
                coords = ('', '')
            time.sleep(random.randint(2, 5))

        if coords is None:
            row_to_write = list(row)
        else:
            row_to_write = list(row + coords)

        line_num = line_num + 1
        print(line_num,row_to_write)
        writer.writerow(row_to_write)

    return line_num

if __name__ == '__main__':
    # Crawl listing pages 1..35 and collect every entry into one CSV file.
    line_num = 0

    # Raw string: '\上' is not a valid escape sequence, so a plain literal
    # triggers an invalid-escape warning; the resulting path is identical.
    # The context manager guarantees the file is flushed and closed even if
    # the crawl aborts half-way.
    with open(r'C:\上海小区大全.csv', 'w', encoding='utf-8', newline='') as f:
        # `writer` has to remain a module-level name: get_area_names() writes
        # its rows through it.
        writer = csv.writer(f, dialect='excel')
        writer.writerow(['页面链接', '小区名称', '百度纬度', '百度经度'])

        urls = ['https://www.anjuke.com/shanghai/cm/p{}'.format(str(i)) for i in range(1,36)]
        for url in urls:
            print(url)
            line_num = get_area_names(url, line_num)
            # Random pause between listing pages to throttle the crawl.
            time.sleep(random.randint(1, 3))



# 你可能感兴趣的:(正则表达式爬虫——上海小区大全)