Scraping Makepolo Supplier Data with Multiple Threads

This post is intended for learning and exchange only. Do not use it for any other purpose; you do so at your own risk.
Environment: Linux + PyCharm + Anaconda

import json
import csv
import random
from queue import Queue
import threading
import requests
from usere_agent import UA  # local module that supplies the User-Agent string
from lxml import etree
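# Note: `usere_agent` above is the author's own helper module exposing a User-Agent
# string named UA; it is not shown in the post. A minimal sketch of what such a
# module might contain (purely an assumption, kept commented out so it does not
# clash with the import above):
#
#   # usere_agent.py (hypothetical)
#   import random
#   _POOL = [
#       'Mozilla/5.0 (X11; Linux x86_64; rv:115.0) Gecko/20100101 Firefox/115.0',
#       'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
#   ]
#   UA = random.choice(_POOL)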


HEADER = {
    'User-Agent': UA,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate',
}

def get_request(url):
    """Fetch a page and return its HTML text, or None on any request error."""
    try:
        response = requests.get(
            url=url,
            headers=HEADER,
            verify=True,
            timeout=50
        )
        return response.text
    except Exception:
        # Swallow network errors; callers must handle a None return value.
        return None
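
import time  # used only by the retry sketch below

# A more robust fetch helper (a sketch, not part of the original post): retry a few
# times with a short random back-off before giving up, which helps with transient
# network errors during a long crawl.
def get_request_with_retry(url, retries=3):
    for _ in range(retries):
        try:
            response = requests.get(url=url, headers=HEADER, verify=True, timeout=50)
            response.encoding = response.apparent_encoding  # decode Chinese pages correctly
            return response.text
        except Exception:
            time.sleep(random.uniform(1, 3))  # brief random back-off, then retry
    return None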


class Img(threading.Thread):
    """Worker thread: pulls listing-page URLs from the queue and scrapes them."""

    def __init__(self, list_img):
        threading.Thread.__init__(self)
        self.list_img = list_img

    def run(self):
        while True:
            key = self.list_img.get()      # take one listing-page URL from the queue
            self.Get_img(key)
            self.list_img.task_done()      # mark it done so queue.join() can return

    def Get_img(self, key):
        try:
            n_d = get_request(key)
            if n_d is None:
                return
            n_data = etree.HTML(n_d)
            # Product links on the listing page.
            good_url = n_data.xpath(
                r'.//div[@class="s_product_item"]//div[@class="s_product_pic_box"]/a[@target="_blank"]/@href')
            for j in good_url:
                good_detali = get_request(j)
                if good_detali is None:
                    continue
                goo_deta_data = etree.HTML(good_detali)
                title_deta = goo_deta_data.xpath(r'.//div[@class="con_msg f1"]/div[@class="con_title"]/text()')
                price = goo_deta_data.xpath(
                    r'.//div[@class="con_msg f1"]/div[@class="con_price"]/span[@class="price"]/text()')
                company_name = goo_deta_data.xpath(
                    r'.//div[@class="con_msg f1"]//div[@class="con_item"]/ul/li[3]/a[@target="_blank"]/text()')
                company_href = goo_deta_data.xpath(
                    r'.//div[@class="con_msg f1"]//div[@class="con_item"]/ul/li[3]/a[@target="_blank"]/@href')
                if company_href:
                    # Follow the company link and pull the contact details.
                    company_deta = get_request(company_href[0])
                    company_deta_data = etree.HTML(company_deta)
                    contacts = company_deta_data.xpath(r'.//div[@class="item_info"]/ul/li[1]/text()')
                    phone = company_deta_data.xpath(r'.//div[@class="item_info"]/ul/li[2]/span[2]/text()')
                    address = company_deta_data.xpath(r'.//div[@class="item_info"]/ul/li[3]/text()')
                    # `ti` is the category title set in the main loop; one CSV per category.
                    with open('/media/liu/_dde_data/project/spider/供应商/mkbl_data/' + ti + '.csv',
                              'a+', newline='', encoding='utf-8') as f:
                        f_csv = csv.writer(f)
                        f_csv.writerow([ti, title_deta[0], price[0], company_name[0], company_href[0],
                                        contacts[0], phone[0], address[0]])
                    print(ti, title_deta[0], price[0], company_name[0], company_href[0],
                          contacts[0], phone[0], address[0])
        except Exception:
            # Any missing field or parse error on a single item is skipped silently.
            pass
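
# The worker threads above all append to the same per-category CSV. A simple way to
# keep concurrent rows from interleaving (a sketch, not in the original code) is to
# serialise the writes with a module-level lock:
csv_lock = threading.Lock()

def write_row(path, row):
    # Only one thread at a time may append to the file.
    with csv_lock:
        with open(path, 'a+', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(row)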


if __name__ == '__main__':
    list_img = Queue()
    url = 'http://china.makepolo.com/list/d14/'
    d = get_request(url)
    data = etree.HTML(d)
    # Category links and their titles from the listing page.
    href = data.xpath(r'.//div[@class="category clearfix"]//dl//dd//a/@href')
    title = data.xpath(r'.//div[@class="category clearfix"]//dl//dd//a/text()')

    # Start a fixed pool of 9 daemon worker threads once, up front,
    # instead of spawning 9 new threads for every category.
    for _ in range(9):
        t = Img(list_img)
        t.daemon = True       # setDaemon() is deprecated; set the attribute instead
        t.start()

    for ti, h in zip(title, href):
        # Enqueue the first 100 listing pages of this category, then wait
        # until the workers have drained the queue before moving on.
        for i in range(1, 101):
            n_h = h + '{}/'.format(i)
            list_img.put(n_h)
        list_img.join()
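
Each category ends up in its own CSV file named after the category title. A quick way to spot-check one of the output files afterwards (a hypothetical helper, assuming the same output directory as above) is:

# check_output.py (hypothetical, not part of the crawler)
import csv

category = 'SomeCategory'  # replace with one of the category titles scraped above
with open('/media/liu/_dde_data/project/spider/供应商/mkbl_data/' + category + '.csv',
          newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row)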
