#!/usr/bin/env python3
# -*- coding:utf-8 -*-
'''
Author:leo
Date&Time:2021/03/26 18:10
Project:Python3 FileName:gaode_request.py
'''
# -*- coding: utf-8 -*-
import json, time, os, re, requests, random
from lxml import etree
from fake_useragent import UserAgent
class Gaode_requests(object):
    """Scrape AMap (amap.com) search result pages via HTML parsing.

    Fetches a search page, extracts store name / star rating / address /
    thumbnail URL from the result list and downloads each thumbnail into
    the relative ``images_gaode/`` directory.
    """

    def __init__(self):
        # Timestamps kept for log/file naming.
        self.logTime = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time()))
        self.curTime = time.strftime('%Y%m%d %H%M%S', time.localtime(time.time()))
        self.nowdate_8, self.nowtime_6 = self.curTime.split(" ")[0], self.curTime.split(" ")[1]
        # Explicit None entries make `requests` bypass any system/env proxy.
        self.proxies = {"http": None, "https": None}
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "guid=fd1b-ee7a-3f28-e167; UM_distinctid=177dbe27dd2261-090e01e25d14aa-5e1a3f18-100200-177dbe27dd35d9; cna=15DbF7eJLyYCAbfeFBk1OWNb; xlly_s=1; _uab_collina=161430179363728282983957; CNZZDATA1255626299=1850886754-1614296565-https%253A%252F%252Fwww.baidu.com%252F%7C1614301969; tfstk=ccKcB0qtcE7bjY_kRmsXLz8GvlcdZt2VGwQy43Eh62WhWMKPic4z8_q7G_RDXP1..; l=eBN5SFucjMSRj8hzBOfaourza779sIRYSuPzaNbMiOCP9TC65wVAW6gTj28BCnGVh6z6R3rMK82YBeYBqBAnnxv9sThLxkDmn; isg=BBcXPum9O1bm07_aExT_SeLopothXOu-04QrDWlEMeZNmDfacS_3DvKy-jiGdcM2",
            "Host": "www.amap.com",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
        }

    def get_proxies(self):
        """Demonstrate two ways of bypassing a local system proxy.

        Returns the explicit no-proxy mapping used by the rest of the class.
        """
        # Method 1: pass explicit None proxies on each request.
        proxies = {"http": None, "https": None}
        requests.get("http://ff2.pw", proxies=proxies)
        response = requests.get("http://httpbin.org/ip", proxies=proxies)
        print(response.text)
        # Method 2: a Session that ignores proxy environment variables.
        session = requests.Session()
        session.trust_env = False
        response = session.get('http://ff2.pw')
        print(response.text)
        return proxies

    def mkdir_file(self, path):
        """Create directory *path* (including parents) if missing.

        Returns True when the directory was created, False if it already
        existed.
        """
        path = path.strip().rstrip("\\")
        if not os.path.exists(path):
            os.makedirs(path)
            return True
        return False

    def get_pic(self, pic_name, pic_url):
        """Download one thumbnail into images_gaode/<pic_name>.jpg."""
        try:
            res_pic = requests.get(url=pic_url, headers=self.headers, timeout=10, proxies=self.proxies)  # binary payload
            # BUG FIX: create the directory the file is actually written to.
            # The original created "/爬虫\\images_gaode" but then wrote into
            # the relative "images_gaode/" directory, which may not exist.
            self.mkdir_file("images_gaode")
            pic_dir = os.path.join("images_gaode", pic_name + '.jpg')
            # Context manager guarantees the handle is closed even on error.
            with open(pic_dir, 'wb') as fp:
                fp.write(res_pic.content)
        except requests.exceptions.ConnectionError:
            print('【失败】当前图片无法下载,图片地址:' + str(pic_url))

    def get_info(self, html, mode, x_xpath):
        """Evaluate *x_xpath* against the lxml tree *html* per *mode*.

        mode: "string"         -> text content of the first match
              "url"/"picture"  -> first match as-is
              "list"           -> list of matches
        String results are GBK-sanitised and stripped of spaces/newlines;
        list results are joined with " | " (entries containing newlines are
        dropped). On any failure the xpath plus the error text is returned.
        """
        try:
            if mode == "string":
                res_value = html.xpath(x_xpath)[0].xpath('string(.)')
            elif mode in ("url", "picture"):
                res_value = html.xpath(x_xpath)[0]
            elif mode == "list":
                res_value = html.xpath(x_xpath)
            if isinstance(res_value, str):
                res_value = res_value.encode('gbk', 'ignore').decode('gbk').strip().replace(" ", '').replace("\n", '')
            elif isinstance(res_value, list):
                res_value = " | ".join([l for l in res_value if "\n" not in l])
        except Exception as e:
            res_value = str(x_xpath) + "\t错误码:" + str(e)
        return res_value

    def get_html(self, url, log=False):
        """GET *url* and return the response parsed as an lxml HTML tree."""
        response = requests.get(url, headers=self.headers, proxies=self.proxies)
        # Round-trip through GBK drops characters the console cannot render.
        res_text = response.text.encode('gbk', 'ignore').decode('gbk')
        html = etree.HTML(res_text, etree.HTMLParser())
        if log:
            print("Start crawling:" + url)
        return html

    def get_catalog(self, html):
        """Walk the search-result list: print each store, download its pic."""
        html_list = html.xpath('//*[@class="serp-list"]/li')
        print("Statistic data:" + str(len(html_list)) + "条数据\n")
        # XPath positions are 1-based; iterate 1..len inclusive.
        # BUG FIX: the original range(1, len(...)) skipped the last entry.
        for i in range(1, len(html_list) + 1):
            res_pic_address = self.get_info(html=html, mode="url", x_xpath=f'//*[@class="serp-list"]/li[{i}]/div[@class="poi-imgbox"]/span/@style')
            res_store_name = self.get_info(html=html, mode="picture", x_xpath=f'//*[@class="serp-list"]/li[{i}]/div[@class="poi-info-left"]/h3[@class="poi-title"]/span/@title')
            res_store_star = self.get_info(html=html, mode="picture", x_xpath=f'//*[@class="serp-list"]/li[{i}]/div[@class="poi-info-left"]/div[@class="poi-info"]/p/span/b/@style')
            res_store_address = self.get_info(html=html, mode="string", x_xpath=f'//*[@class="serp-list"]/li[{i}]/div[@class="poi-info-left"]/div[@class="poi-info"]/p[2]')
            # Normalise raw xpath results.
            res_pic_address = str(re.findall(r'''background-image: url\("([^"]+)"\);''', res_pic_address)[0])
            res_store_name = str(res_store_name)
            # The star rating is rendered as a CSS bar width; 13px per star.
            res_store_star = str(float(re.findall(r'''width:([^"]+)px''', res_store_star)[0]) / 13.0)
            res_store_address = str(res_store_address)
            self.get_pic(pic_name=str(i) + "_" + res_store_name, pic_url=res_pic_address)
            print('店铺名称:' + res_store_name, end='\n------------列表信息------------\n')
            print('店铺星级:' + res_store_star, end='\n')
            print('店铺地址:' + res_store_address, end='\n')
            print('图片地址:' + res_pic_address, end='\n')

    def write_info(self, res_text):
        """Append *res_text* as one JSON line to response_2021.txt."""
        with open('response_2021.txt', 'a+', encoding='utf-8') as write:
            write.write(json.dumps(res_text, ensure_ascii=False) + '\n')
            write.write('\n')

    def run(self):
        """Entry point: crawl one hard-coded search and print the catalog."""
        # BUG FIX: use this instance instead of constructing a second one.
        html = self.get_html(url='https://www.amap.com/search?query=钱大妈&city=44060', log=True)
        self.get_catalog(html=html)
class Gaode_requests_json(object):
    """Scrape AMap search results through the JSON ``poiInfo`` endpoint.

    Pages through the endpoint, prints name / rating / address for each POI
    and downloads its picture to a per-page directory.
    """

    def __init__(self):
        # Host must match the JSON endpoint; UA is randomised per run.
        self.headers = {
            "Host": "map.amap.com",
            'User-Agent': UserAgent().random
        }
        # Explicit None entries make `requests` bypass any system/env proxy.
        self.proxies = {"http": None, "https": None}
        # Reference query parameters for the poiInfo endpoint (get_res()
        # currently builds the URL as a literal string instead of using this).
        self.data = {
            "query_type": "TQUERY",
            "pagesize": "20",
            "pagenum": "1",
            "qii": "true",
            "cluster_state": "5",
            "need_utd": "true",
            "utd_sceneid": "1000",
            "div": "PC1000",
            "addr_poi_merge": "true",
            "is_classify": "true",
            "zoom": "9.16",
            "city": "440600",
            "geoobj": "103.868799|30.198854|105.050418|31.181088",
            "keywords": "钱大妈"
        }

    def mkdir_file(self, path):
        """Create directory *path* (including parents) if missing.

        Returns True when the directory was created, False if it already
        existed.
        """
        path = path.strip().rstrip("\\")
        if not os.path.exists(path):
            os.makedirs(path)
            return True
        return False

    def get_pic(self, pic_pash, pic_name, pic_url):
        """Download one picture into <pic_pash>/<pic_name>.jpg.

        Returns the saved file name ("<pic_name>.jpg") whether or not the
        download succeeded (the caller prints it either way).
        """
        try:
            res_pic = requests.get(url=pic_url, headers=self.headers, timeout=10, proxies=self.proxies)  # binary payload
            self.mkdir_file(pic_pash)
            # os.path.join instead of "\\" keeps the path valid off Windows.
            pic_dir = os.path.join(pic_pash, pic_name + '.jpg')
            # Context manager guarantees the handle is closed even on error.
            with open(pic_dir, 'wb') as fp:
                fp.write(res_pic.content)
        except requests.exceptions.ConnectionError:
            print('【失败】当前图片无法下载,图片地址:' + str(pic_url))
        return pic_name + ".jpg"

    def get_res(self):
        """Fetch result pages 1..12, print each POI and save its picture."""
        for page in range(1, 12 + 1):
            try:
                url = f"https://www.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum={page}&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=10&city=440600&geoobj=113.012853|22.753595|113.672034|23.341201&keywords=钱大妈"
                response = requests.get(url=url, timeout=5, headers=self.headers, proxies=self.proxies)
                res_json = response.json()
                time.sleep(1)  # polite delay between page requests
                res_list = res_json["data"]["poi_list"]
                print(f"----第{page}页:数据{len(res_list)}条----")
                for i, poi in enumerate(res_list):
                    # .get() with a default replaces the bare try/except per field.
                    res_name = poi.get("disp_name", "空")
                    res_rating = poi.get("rating", "空")
                    res_address = poi.get("address", "空")
                    try:
                        res_pic_address = poi["domain_list"][5]["value"]
                    except (KeyError, IndexError, TypeError):
                        res_pic_address = "空"
                    # BUG FIX: reset pic_name per item. Previously a failed
                    # download printed the PREVIOUS item's file name (or
                    # raised NameError on the first item).
                    pic_name = "空"
                    try:
                        pic_name = self.get_pic(pic_pash=f"D:\\Mytest\\Svnbucket\\Python3\\爬虫\\gaode_file\\images_gaode_page{page}", pic_name=str(i + 1) + "_" + res_name + "_" + res_address, pic_url=res_pic_address)
                    except Exception:
                        res_pic_address = "空"
                    print(f'{i+1}.店铺名称:' + res_name, end='\n')
                    print('店铺星级:' + res_rating, end='\n')
                    print('店铺地址:' + res_address, end='\n')
                    print('图片名称:' + pic_name, end='\n')
                    print('图片地址:' + res_pic_address, end='\n\n')
            except Exception as e:
                # Page-level failures (network, unexpected JSON) are logged
                # and the next page is attempted.
                print(e)

    def run(self):
        """Entry point: crawl all pages via the JSON endpoint."""
        # BUG FIX: use this instance instead of constructing a second one.
        self.get_res()
if __name__ == "__main__":
    # Script entry point: run the JSON-endpoint crawler.
    crawler = Gaode_requests_json()
    crawler.run()
# NOTE(review): the lines below are captured console output that had been
# pasted verbatim into the source file (a SyntaxError). Commented out so the
# module can be imported; content kept for reference. Note item 5 shows the
# stale-pic_name symptom (it repeats item 4's file name).
# ----第1页:数据20条----
# 1.店铺名称:钱大妈
# 店铺星级:4.6
# 店铺地址:海六路17号
# 图片名称:1_钱大妈_海六路17号.jpg
# 图片地址:http://store.is.autonavi.com/showpic/20105d428a05884bd27f3b08d8720b21
# 2.店铺名称:钱大妈
# 店铺星级:3.5
# 店铺地址:
# 图片名称:2_钱大妈_.jpg
# 图片地址:http://store.is.autonavi.com/showpic/19fbf6b3b480d6af29a63969dcf28de8
# 3.店铺名称:钱大妈
# 店铺星级:4.6
# 店铺地址:盐步穗盐东路花地湾雍景豪园帝景台25-30座61号商铺
# 图片名称:3_钱大妈_盐步穗盐东路花地湾雍景豪园帝景台25-30座61号商铺.jpg
# 图片地址:http://store.is.autonavi.com/showpic/5ba6be6a133c2ad85911559016651ef9
# 4.店铺名称:钱大妈
# 店铺星级:4.6
# 店铺地址:大沥镇建设大道中海金沙湾中区商业街B53号
# 图片名称:4_钱大妈_大沥镇建设大道中海金沙湾中区商业街B53号.jpg
# 图片地址:http://store.is.autonavi.com/showpic/a15e133eadb5fb4a5c5ba4a4ae484296
# 5.店铺名称:钱大妈
# 店铺星级:
# 店铺地址:荷富路与明国路交叉路口往东南约50米(美的西海岸)
# 图片名称:4_钱大妈_大沥镇建设大道中海金沙湾中区商业街B53号.jpg
# 图片地址:空
# 6.店铺名称:钱大妈
# 店铺星级:
# 店铺地址:文星路活力盈居地铺10号
# 图片名称:6_钱大妈_文星路活力盈居地铺10号.jpg
# 图片地址:http://store.is.autonavi.com/showpic/11f18279ee0be161718ebf6d38f7a2c8
# 7.店铺名称:钱大妈
# 店铺星级:3.5
# 店铺地址:佛山大道北东海国际3区2期达伦五金对面
# 图片名称:7_钱大妈_佛山大道北东海国际3区2期达伦五金对面.jpg
# 图片地址:http://store.is.autonavi.com/showpic/df8146ecbc6601c141df94492d6209cf
# 8.店铺名称:钱大妈
# 店铺星级:
# 店铺地址:
# 图片名称:8_钱大妈_.jpg
# 图片地址:http://store.is.autonavi.com/showpic/ce2ed6c27936805194b189fbbbad4a90&type=pic
# 9.店铺名称:钱大妈
# 店铺星级:
# 店铺地址:碧桂园翡翠湾东南(庆云大道北)
# 图片名称:9_钱大妈_碧桂园翡翠湾东南(庆云大道北).jpg
# 图片地址:http://store.is.autonavi.com/showpic/2806d087aa92d9f443c58acd7061d9c3
# 10.店铺名称:钱大妈
# 店铺星级:3.5
# 店铺地址:大沥镇盐步广佛路平地段89号
# 图片名称:10_钱大妈_大沥镇盐步广佛路平地段89号.jpg
# 图片地址:http://store.is.autonavi.com/showpic/38bc9fba23aeed54fd96c6c12f46568b