Python: crawling the Mafengwo (马蜂窝) site end to end. This post covers countries and cities; scenic spots, independent-travel guides, and travel notes will follow in a later post.

1. Crawling all countries

1) Starting URL

start_url = 'http://www.mafengwo.cn/mdd/'

2) Request code

        proxies = get_proxies_requests()
        random_header = get_header()
        add_header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.mafengwo.cn',
            'Upgrade-Insecure-Requests': '1',
        }
        last_header = dict()
        last_header.update(random_header)
        last_header.update(add_header)
        
        html = requests.get(url, headers=last_header, proxies=proxies, timeout=10)  # url is the start_url above
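
Neither get_proxies_requests() nor get_header() is shown in this post. A minimal sketch of what they might look like, purely as an assumption; PROXY_POOL and USER_AGENTS below are placeholders, not anything from the original code:

        import random

        # placeholders; swap in a real proxy source and a longer UA list
        PROXY_POOL = ['123.56.78.90:8888']
        USER_AGENTS = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        ]

        def get_header():
            """Return a base header dict with a randomly chosen User-Agent."""
            return {'User-Agent': random.choice(USER_AGENTS)}

        def get_proxies_requests():
            """Return a requests-style proxies dict built from a random proxy."""
            proxy = random.choice(PROXY_POOL)
            return {'http': 'http://' + proxy, 'https': 'https://' + proxy}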

3) Extraction code

                content_list = list()
                html = etree.HTML(html.content)  # requires: from lxml import etree
                # Continent blocks
                continent_lists = html.xpath("//div[@class='bd']/dl[@class='item']")
                for continent in continent_lists:
                    continent_name = continent.xpath("./dt[@class='sub-title']/text()")[0]
                    # Countries under this continent
                    country_lists = continent.xpath(".//dd[@class='clearfix']//a")
                    for country in country_lists:
                        country_dict = dict()
                        country_chinese_name = country.xpath("./text()")[0]
                        country_english_name = country.xpath("./span/text()")[0]
                        country_part_url = country.xpath("./@href")[0]
                        country_url = 'http://www.mafengwo.cn' + country_part_url
                        country_id = country_part_url.split('.')[0].split('/')[-1]  # e.g. '/travel-scenic-spot/mafengwo/21536.html' -> '21536'
                        # part_popular_cities_url is a URL template defined elsewhere,
                        # presumably 'http://www.mafengwo.cn/mdd/citylist/{}.html' (see the sample record in step 2)
                        country_start_popular_cities_url = part_popular_cities_url.format(country_id)
                        country_dict['continent'] = continent_name
                        country_dict['country_chinese_name'] = country_chinese_name
                        country_dict['country_english_name'] = country_english_name
                        country_dict['country_url'] = country_url
                        country_dict['country_id'] = country_id
                        country_dict['country_start_popular_cities_url'] = country_start_popular_cities_url

                        content_list.append(country_dict)
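
The post doesn't show how content_list is persisted, but step 2 below reads ./results/mafengwo_country_info.json back with readlines() and json.loads() per line, so a save step along these lines (one JSON object per line) is implied:

        import json
        import os

        os.makedirs('./results', exist_ok=True)
        with open('./results/mafengwo_country_info.json', 'a', encoding='utf-8') as f:
            for country_dict in content_list:
                # ensure_ascii=False keeps the Chinese names readable in the file
                f.write(json.dumps(country_dict, ensure_ascii=False) + '\n')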

4) Results

201 countries were crawled.

2. Crawling all destinations

Mafengwo data endpoint:

pagedata_destinationlist_url = 'http://www.mafengwo.cn/mdd/base/list/pagedata_citylist'

self.pagedata_destinationlist_url = pagedata_destinationlist_url
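
Before wiring this into the spider, the endpoint can be probed standalone. A minimal sketch (China's mddid 21536 is taken from the sample record in the next step; depending on the site's anti-crawling rules, the fuller header set from step 3 may be required):

        import requests

        resp = requests.post(
            'http://www.mafengwo.cn/mdd/base/list/pagedata_citylist',
            data={'mddid': '21536', 'page': '1'},
            headers={'X-Requested-With': 'XMLHttpRequest'},
            timeout=10,
        )
        body = resp.json()
        # per the extraction code below: 'list' is an HTML fragment of city cards,
        # 'page' is the pager HTML that gets checked for 后一页 / 末页 links
        print(sorted(body.keys()))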

1) Iterate over the country IDs and enqueue every country

        # Load the country records produced in step 1 (one JSON object per line)
        with open('./results/mafengwo_country_info.json', 'r', encoding='utf-8') as f:
            country_lines = list(set(f.readlines()))  # de-duplicate repeated lines
        print('The file contains ' + str(len(country_lines)) + ' records...')

        # Sample record:
        # {"country_english_name": "China", "country_start_popular_cities_url": "http://www.mafengwo.cn/mdd/citylist/21536.html", "country_url": "http://www.mafengwo.cn/travel-scenic-spot/mafengwo/21536.html", "continent": "亚洲", "country_chinese_name": "中国 ", "country_id": "21536"}

        for i, line in enumerate(country_lines):
            crawling_data = json.loads(line)
            self.url_queue.put((crawling_data, i))
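
The snippets reference url_queue, html_queue and content_list_queue, which implies a threaded producer/consumer pipeline. A rough skeleton of the structure they suggest, with assumed thread counts and an assumed total_page cap:

        from queue import Queue
        import threading

        class MafengwoDestinationSpider:
            def __init__(self):
                self.url_queue = Queue()           # (crawling_data, i), one per country
                self.html_queue = Queue()          # (html, i, crawling_data, flag, page_num)
                self.content_list_queue = Queue()  # (last_lists, i), one per parsed page
                self.total_page = 200              # assumed upper bound on pages per country

            # the bodies of these workers are the blocks shown in steps 1)-4),
            # each wrapped in a `while True: ... queue.task_done()` loop
            def request_worker(self): ...
            def parse_worker(self): ...
            def save_worker(self): ...

            def run(self):
                for target, count in [(self.request_worker, 5),
                                      (self.parse_worker, 3),
                                      (self.save_worker, 1)]:
                    for _ in range(count):
                        threading.Thread(target=target, daemon=True).start()
                self.url_queue.join()
                self.html_queue.join()
                self.content_list_queue.join()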

2) Paging through the city-list endpoint


            country_id = crawling_data['country_id']
            empty_html_count = 0
            page_num = 0
            for page in range(1, self.total_page):
                page_num += 1

                data = {
                    "mddid": country_id,
                    "page": str(page),
                }
                try:
                    html = self._get_url_content_post(data)
                except Exception:
                    html = ''
                flag = 0
                if html != '':
                    source_code_html = html.content.decode()
                    dic = json.loads(source_code_html)

                    try:
                        page_info = dic['page']
                    except KeyError:
                        page_info = ''
                    # no "next page" (后一页) or "last page" (末页) link means this is the final page
                    if '后一页' not in page_info and '末页' not in page_info:
                        flag = 1
                else:
                    empty_html_count += 1
                    # give up on this country after 30 failed pages
                    if empty_html_count == 30:
                        break
                # i arrived alongside crawling_data from url_queue
                self.html_queue.put((html, i, crawling_data, flag, page_num))
                if flag:
                    # last page reached; stop paging this country
                    break

3) Building the POST request

        proxies = get_proxies_requests()
        random_header = get_header()
        add_header = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
            # Content-Length is intentionally omitted; requests computes it from the body,
            # and hard-coding it breaks as soon as the mddid or page number changes length
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            # 'Cookie': '...',  # session cookie omitted; the endpoint responds without it
            'Host': 'www.mafengwo.cn',
            'Origin': 'http://www.mafengwo.cn',
            # 'Referer': 'http://www.mafengwo.cn/mdd/citylist/21536.html',
            # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
        }
        last_header = dict()
        last_header.update(random_header)
        last_header.update(add_header)
        sleep(random.uniform(2, 5))  # throttle to avoid triggering anti-crawling
        html = requests.post(self.pagedata_destinationlist_url, headers=last_header, proxies=proxies, timeout=10, data=data)
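
Free proxies fail often (the paging loop above tolerates up to 30 empty responses per country), so it can be worth retrying the POST with a fresh proxy before giving up. A small sketch on top of the same helpers; post_with_retry and max_retries are made up here, not part of the original code:

        import random
        from time import sleep
        import requests

        def post_with_retry(url, data, max_retries=3):
            """Retry the POST with a fresh proxy and header set on each failure."""
            for attempt in range(max_retries):
                try:
                    resp = requests.post(url, headers=get_header(),
                                         proxies=get_proxies_requests(),
                                         data=data, timeout=10)
                    resp.raise_for_status()
                    return resp
                except requests.RequestException:
                    sleep(random.uniform(2, 5))  # back off before the next attempt
            return ''  # the caller already treats '' as an empty page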

4) Data extraction

                source_code = html.content.decode()
                html_dict = json.loads(source_code)
                html_content = html_dict['list']  # the 'list' field is an HTML fragment
                html_content = etree.HTML(html_content)
                try:
                    # note the trailing space in the class value; it matches the site's markup
                    li_lists = html_content.xpath("//li[@class='item ']")
                except Exception:
                    li_lists = list()
                # Extract per-destination info
                last_lists = list()
                for destination_info in li_lists:
                    destination_dict = dict()
                    destination_chinese_name = ''.join(destination_info.xpath(".//div[@class='title']/text()")).strip()
                    english_name_nodes = destination_info.xpath("./div[@class='img']//div[@class='title']/p/text()")
                    destination_english_name = english_name_nodes[0] if english_name_nodes else ''
                    destination_id = destination_info.xpath("./div[@class='img']/a/@data-id")[0]
                    destination_url = 'http://www.mafengwo.cn' + destination_info.xpath("./div[@class='img']/a/@href")[0]
                    destination_big_head_img_url = destination_info.xpath("./div[@class='img']/a/img/@data-original")[0]
                    if '?' in destination_big_head_img_url:
                        destination_big_head_img_url = destination_big_head_img_url.split('?')[0]
                    destination_num_people_have_been = destination_info.xpath(".//div[@class='nums']/b/text()")[0]
                    destination_summary = destination_info.xpath(".//div[@class='detail']/text()")[0].strip()
                    destination_top_lists = destination_info.xpath(".//dl[@class='caption']/dd/a")
                    # Top-3 attractions; pad with empty entries when fewer than 3 are listed
                    destination_top = dict()
                    for idx in range(3):
                        con = {'attractions_name': '', 'attractions_id': '', 'attractions_url': ''}
                        if idx < len(destination_top_lists):
                            node = destination_top_lists[idx]
                            con['attractions_name'] = node.xpath("./@title")[0]
                            con['attractions_id'] = node.xpath("./@data-id")[0]
                            con['attractions_url'] = 'http://www.mafengwo.cn' + node.xpath("./@href")[0]
                        destination_top['top_{}'.format(idx + 1)] = con
                    destination_dict['destination_chinese_name'] = destination_chinese_name
                    destination_dict['destination_english_name'] = destination_english_name
                    destination_dict['destination_id'] = destination_id
                    destination_dict['destination_url'] = destination_url
                    destination_dict['destination_big_head_img_url'] = destination_big_head_img_url
                    destination_dict['destination_num_people_have_been'] = destination_num_people_have_been
                    destination_dict['destination_summary'] = destination_summary
                    destination_dict['destination_top'] = destination_top
                    destination_dict.update(crawling_data)
                    last_lists.append(destination_dict)
                try:
                    if len(last_lists) > 0:
                        # round-trip through json to confirm every record serializes cleanly
                        last_lists = json.loads(json.dumps(last_lists))
                        print('Country #{}, page {}: {} records, city-list URL: {}'.format(
                            i, page_num, len(last_lists), last_lists[0]['country_start_popular_cities_url']))

                        self.content_list_queue.put((last_lists, i))
                except Exception as e:
                    print('Failed to enqueue parsed page: {}'.format(e))
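
On the consuming side, a saver along these lines would drain content_list_queue, de-duplicate by destination_id, and append JSON lines; the body of save_worker is not shown in the post, and the output file name is assumed by analogy with step 1:

        import json

        def save_worker(self):
            """Drain content_list_queue and append records as JSON lines."""
            seen_ids = set()
            while True:
                last_lists, i = self.content_list_queue.get()
                with open('./results/mafengwo_destination_info.json', 'a', encoding='utf-8') as f:
                    for destination_dict in last_lists:
                        if destination_dict['destination_id'] in seen_ids:
                            continue  # skip destinations already written
                        seen_ids.add(destination_dict['destination_id'])
                        f.write(json.dumps(destination_dict, ensure_ascii=False) + '\n')
                self.content_list_queue.task_done()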

5) Results

Over ten thousand destinations were crawled.