# Assumed imports; get_proxies_requests() and get_header() are the
# project's own proxy / random User-Agent helpers.
import requests
from lxml import etree

start_url = 'http://www.mafengwo.cn/mdd/'
# Template inferred from the captured records below,
# e.g. http://www.mafengwo.cn/mdd/citylist/21536.html
part_popular_cities_url = 'http://www.mafengwo.cn/mdd/citylist/{}.html'
proxies = get_proxies_requests()
random_header = get_header()
add_header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.mafengwo.cn',
    'Upgrade-Insecure-Requests': '1',
}
last_header = dict()
last_header.update(random_header)
last_header.update(add_header)
response = requests.get(start_url, headers=last_header, proxies=proxies, timeout=10)
content_list = list()
html = etree.HTML(response.content)
# Continent blocks
continent_lists = html.xpath("//div[@class='bd']/dl[@class='item']")
for continent in continent_lists:
    continent_name = continent.xpath("./dt[@class='sub-title']/text()")[0]
    # Countries under this continent
    country_lists = continent.xpath(".//dd[@class='clearfix']//a")
    for country in country_lists:
        country_dict = dict()
        country_chinese_name = country.xpath("./text()")[0]
        country_english_name = country.xpath("./span/text()")[0]
        country_part_url = country.xpath("./@href")[0]
        country_url = 'http://www.mafengwo.cn' + country_part_url
        country_id = country_part_url.split('.')[0].split('/')[-1]
        country_start_popular_cities_url = part_popular_cities_url.format(country_id)
        country_dict['continent'] = continent_name
        country_dict['country_chinese_name'] = country_chinese_name
        country_dict['country_english_name'] = country_english_name
        country_dict['country_url'] = country_url
        country_dict['country_id'] = country_id
        country_dict['country_start_popular_cities_url'] = country_start_popular_cities_url
        content_list.append(country_dict)
This pass collected 201 countries.
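The later stage reads these records back from ./results/mafengwo_country_info.json one JSON object per line, so the country list is presumably persisted along these lines (a minimal sketch; the file path matches the read path used below, everything else is an assumption):

import json

# Append one JSON object per line, matching the readlines()/json.loads()
# consumer further down.
with open('./results/mafengwo_country_info.json', 'w', encoding='utf-8') as f:
    for country_dict in content_list:
        f.write(json.dumps(country_dict, ensure_ascii=False) + '\n')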
Mafengwo's data API:
pagedata_destinationlist_url = 'http://www.mafengwo.cn/mdd/base/list/pagedata_citylist'
self.pagedata_destinationlist_url = pagedata_destinationlist_url
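Judging from the crawler code below, the endpoint takes a simple form POST with the destination id and page number, and answers with JSON whose 'list' field is an HTML fragment and whose 'page' field is the pager markup. A minimal standalone probe under that assumption (the real crawler also sends random headers and proxies):

import requests

# Hypothetical probe; field names and response keys are taken from the
# crawler code below, 21536 is the sample country_id for China.
data = {'mddid': '21536', 'page': '1'}
resp = requests.post('http://www.mafengwo.cn/mdd/base/list/pagedata_citylist',
                     headers={'X-Requested-With': 'XMLHttpRequest'},
                     data=data, timeout=10)
payload = resp.json()
print(list(payload.keys()))  # expected: 'list' (HTML fragment), 'page' (pager HTML)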
# One JSON-encoded country record per line; set() drops exact duplicate lines.
with open('./results/mafengwo_country_info.json', 'r', encoding='utf-8') as file:
    country_lines = file.readlines()
last_lists = list(set(country_lines))
print('The file holds {} records...'.format(len(last_lists)))
i = 0
# Sample record:
# {"country_english_name": "China", "country_start_popular_cities_url": "http://www.mafengwo.cn/mdd/citylist/21536.html", "country_url": "http://www.mafengwo.cn/travel-scenic-spot/mafengwo/21536.html", "continent": "亚洲", "country_chinese_name": "中国 ", "country_id": "21536"}
for crawling_data in last_lists[:]:
    crawling_data = json.loads(crawling_data)
    self.url_queue.put((crawling_data, i))
    i += 1
    country_id = crawling_data['country_id']
    html_is_kong = 0  # counts consecutive empty responses ("kong" = empty)
    page_num = 0
    for page in range(1, self.total_page):
        page_num += 1
        data = {
            "mddid": country_id,
            "page": str(page),
        }
        try:
            html = self._get_url_content_post(data)
        except Exception:
            html = ''
        flag = 0
        if html != '':
            source_code_html = html.content.decode()
            dic = json.loads(source_code_html)
            try:
                page_info = dic['page']
            except KeyError:
                page_info = ''
            # '后一页' = "next page", '末页' = "last page"; when neither link
            # appears in the pager markup, this is the final page.
            if '后一页' not in page_info and '末页' not in page_info:
                flag = 1
        else:
            html_is_kong += 1
            if html_is_kong == 30:
                break
        self.html_queue.put((html, i, crawling_data, flag, page_num))
        if flag:
            # Exit-loop condition: last page reached
            break
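The snippets reference three queues (url_queue, html_queue, content_list_queue), which suggests a producer/consumer pipeline. A minimal sketch of how the stages might be wired with threads (the queue names come from the code; the worker functions and thread counts are assumptions):

import threading
from queue import Queue

url_queue = Queue()            # (crawling_data, i) records from the country file
html_queue = Queue()           # (html, i, crawling_data, flag, page_num)
content_list_queue = Queue()   # (last_lists, i) parsed city records

def fetch_worker():
    # Stage 1: take (crawling_data, i) off url_queue, POST each page,
    # push raw responses onto html_queue (the loop above).
    pass

def parse_worker():
    # Stage 2: take responses off html_queue, extract city records,
    # push them onto content_list_queue (the parsing code below).
    pass

def save_worker():
    # Stage 3: persist the parsed records.
    pass

for worker in (fetch_worker, parse_worker, save_worker):
    threading.Thread(target=worker, daemon=True).start()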
# Presumably the body of self._get_url_content_post(data); requires
# `import random` and `from time import sleep`.
proxies = get_proxies_requests()
random_header = get_header()
add_header = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    # Content-Length is computed automatically by requests; hard-coding it
    # risks a mismatch, so it is left out here.
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Host': 'www.mafengwo.cn',
    'Origin': 'http://www.mafengwo.cn',
    # 'Referer': 'http://www.mafengwo.cn/mdd/citylist/21536.html',
    'X-Requested-With': 'XMLHttpRequest',
}
last_header = dict()
last_header.update(random_header)
last_header.update(add_header)
sleep(random.uniform(2, 5))
html = requests.post(self.pagedata_destinationlist_url, headers=last_header, proxies=proxies, timeout=10, data=data)
html = html.content.decode()
html_dict = json.loads(html)
html_content = html_dict['list']
html_content = etree.HTML(html_content)
try:
    li_lists = html_content.xpath("//li[@class='item ']")
except Exception:
    li_lists = list()
# Extract one record per destination (city)
last_lists = list()
for destination_info in li_lists:
    destination_dict = dict()
    destination_chinese_name = ''.join(destination_info.xpath(".//div[@class='title']/text()")).strip()
    english_nodes = destination_info.xpath("./div[@class='img']//div[@class='title']/p/text()")
    destination_english_name = english_nodes[0] if english_nodes else ''
    destination_id = destination_info.xpath("./div[@class='img']/a/@data-id")[0]
    destination_url = destination_info.xpath("./div[@class='img']/a/@href")[0]
    destination_url = 'http://www.mafengwo.cn' + destination_url
    destination_big_head_img_url = destination_info.xpath("./div[@class='img']/a/img/@data-original")[0]
    if '?' in destination_big_head_img_url:
        destination_big_head_img_url = destination_big_head_img_url.split('?')[0]
    destination_num_people_have_been = destination_info.xpath(".//div[@class='nums']/b/text()")[0]
    destination_summary = destination_info.xpath(".//div[@class='detail']/text()")[0].strip()
    destination_top_lists = destination_info.xpath(".//dl[@class='caption']/dd/a")
    # Top-3 attractions; missing slots are filled with empty strings so
    # every record keeps the same shape.
    top = dict()
    for rank in range(3):
        con = {'attractions_name': '', 'attractions_id': '', 'attractions_url': ''}
        if rank < len(destination_top_lists):
            node = destination_top_lists[rank]
            con['attractions_name'] = node.xpath("./@title")[0]
            con['attractions_id'] = node.xpath("./@data-id")[0]
            con['attractions_url'] = 'http://www.mafengwo.cn' + node.xpath("./@href")[0]
        top['top_{}'.format(rank + 1)] = con
    destination_top = top
    destination_dict['destination_chinese_name'] = destination_chinese_name
    destination_dict['destination_english_name'] = destination_english_name
    destination_dict['destination_id'] = destination_id
    destination_dict['destination_url'] = destination_url
    destination_dict['destination_big_head_img_url'] = destination_big_head_img_url
    destination_dict['destination_num_people_have_been'] = destination_num_people_have_been
    destination_dict['destination_summary'] = destination_summary
    destination_dict['destination_top'] = destination_top
    # Merge in the parent country record
    destination_dict.update(crawling_data)
    last_lists.append(destination_dict)
try:
    if len(last_lists) > 0:
        # The dumps/loads round trip verifies every record is JSON-serialisable.
        last_lists = json.loads(json.dumps(last_lists))
        print('Country #{}, page {}: {} records..., city-list URL: {}'.format(
            i, page_num, len(last_lists), last_lists[0]['country_start_popular_cities_url']))
        self.content_list_queue.put((last_lists, i))
except Exception:
    pass
In total the crawl collected over ten thousand destinations...
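The records on content_list_queue presumably end up in a JSON-lines file the same way the country list did. A sketch under that assumption (the queue name comes from the code above; the method name and output path are hypothetical):

import json

# Hypothetical save worker: drain content_list_queue and append one JSON
# object per line, mirroring the country-file format.
def save_worker(self):
    while True:
        last_lists, i = self.content_list_queue.get()
        with open('./results/mafengwo_destination_info.json', 'a', encoding='utf-8') as f:
            for record in last_lists:
                f.write(json.dumps(record, ensure_ascii=False) + '\n')
        self.content_list_queue.task_done()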