import requests
from fake_useragent import UserAgent
import pandas as pd
import numpy as np
from urllib.parse import quote
import re
from time import sleep
from random import randint
import random
# Load the table of place names to geocode from the input spreadsheet.
# The second column (index 1) is assumed to hold the place/village name
# used to build the geocoder query URLs — TODO confirm against the file.
place_name = pd.read_excel('企业信息获取.xlsx')
hangshu = place_name.shape[0]  # number of rows (records to geocode)
leishu = place_name.shape[1]   # number of columns (kept for reference)
# NOTE(review): the original had a bare `place_name` expression here — a
# Jupyter display artifact that does nothing in a script; removed.
class Url_Mnger:
    """Builds Baidu geocoder request URLs from the loaded place-name table."""

    def Url_join(self, hangshu):
        """Return one geocoder URL per row of the module-level ``place_name``.

        ``hangshu`` is the number of rows to process.  The place name is
        taken from the second column of ``place_name`` and URL-encoded
        before being interpolated into the query string.
        """
        all_url = []
        for row_idx in range(hangshu):
            village_name = place_name.iloc[row_idx, 1]
            encoded_name = quote(village_name)
            request_url = 'http://api.map.baidu.com/geocoder?address={}'.format(encoded_name)
            print(village_name, request_url)
            all_url.append(request_url)
        return all_url
class Response_Cast(object):
    """Thin HTTP client that fetches a URL with a randomized User-Agent."""

    def Get_response(self, url):
        """GET ``url`` and return the response body as text.

        A random Chrome User-Agent header is sent to reduce the chance of
        the geocoder throttling repeated requests from the same client.
        """
        headers = {
            'User-Agent': UserAgent().chrome,
        }
        # Fix: the original call had no timeout, so one stalled request
        # could hang the entire scrape indefinitely.
        response = requests.get(url=url, headers=headers, timeout=10)
        return response.text
class Info_Manger:
    """Parses geocoder responses and assembles/saves the result table."""

    def Parse_html(self, info_text):
        """Extract ``(latitude, longitude)`` strings from a geocoder response.

        Bug fix: the original used the pattern ``(.+)`` for BOTH values,
        which just captures the first line of the whole response, so
        latitude and longitude were always identical and never actual
        coordinates.  The Baidu geocoder v1 (default XML output) wraps the
        values in <lat>/<lng> tags — TODO confirm against a live response.

        Raises IndexError if the response contains no <lat>/<lng> tags
        (e.g. an error payload).
        """
        latitude = re.findall(r'<lat>(.+?)</lat>', info_text)
        longitude = re.findall(r'<lng>(.+?)</lng>', info_text)
        latitude = latitude[0]
        longitude = longitude[0]
        print(latitude, longitude)
        return latitude, longitude

    def Make_dataform(self, lat, longi):
        """Combine place names with parsed coordinates into a DataFrame.

        NOTE(review): reads the module-level ``place_name`` table; ``lat``
        and ``longi`` must each have one entry per row of that table.
        """
        df = pd.DataFrame({'a_point': place_name.iloc[:, 1],
                           'a_lat': lat,
                           'a_longi': longi})
        return df

    def Savedata(self, df):
        """Persist the table; GBK encoding for Excel on zh-CN Windows."""
        df.to_csv('geo_data_gaode_quchong.csv', encoding='GBK')
class Run_Scrapy:
    """Drives the whole pipeline: build URLs, fetch, parse, assemble, save."""

    def __init__(self):
        """Run the scrape end-to-end as a side effect of construction.

        NOTE(review): doing all the work in __init__ is unusual but kept
        to preserve the existing call pattern.
        """
        url_manger = Url_Mnger()
        url_list = url_manger.Url_join(hangshu)
        url_list_length = len(url_list)
        response_cast = Response_Cast()
        info_manger = Info_Manger()
        lat = []
        longi = []
        # Fix: the original branched on (j % 100) == 0, but both branches
        # were byte-identical — dead code (probably a leftover hook for a
        # periodic checkpoint), so the conditional is removed.
        for j, url in enumerate(url_list):
            print(j, '/', url_list_length)
            # polite randomized delay so the geocoder is not hammered
            sleep(random.uniform(1, 1.5))
            response_info = response_cast.Get_response(url)
            info_latitude, info_longitude = info_manger.Parse_html(response_info)
            lat.append(info_latitude)
            longi.append(info_longitude)
        make_dataform = info_manger.Make_dataform(lat, longi)
        info_manger.Savedata(make_dataform)
if __name__ == '__main__':
    # Bug fix: the original line was the bare name ``Run_Scrapy`` — it never
    # instantiated the class, so running the script did nothing at all.
    Run_Scrapy()