# 爬虫实战9:爬取1688网站商家信息 (Scraping practice 9: collect seller info from 1688.com)

# coding:utf-8
import requests
import bs4
import time
import xlwt
import random


def get_IP():
    """Scrape a list of free HTTP proxies from xicidaili.com.

    Returns:
        list[dict]: one ``{'ip': ..., 'port': ...}`` dict (string values)
        per row on the page whose protocol column is ``HTTP``. Empty list
        when the proxy table cannot be found (blocked request or layout
        change).
    """
    url = "http://www.xicidaili.com/nn/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
    }
    session = requests.session()
    # timeout added so a dead proxy site cannot hang the whole script
    html = session.get(url, headers=headers, timeout=10).text
    soup = bs4.BeautifulSoup(html, 'lxml')
    ip_table = soup.find('table', attrs={'id': 'ip_list'})
    ip_list = []
    if ip_table is None:
        # Request was blocked or the page layout changed; fail soft.
        return ip_list
    # Skip the header row. Column layout on this site:
    # td[1]=ip, td[2]=port, td[5]=protocol.
    for row in ip_table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) > 5 and cells[5].text == 'HTTP':
            ip_list.append({'ip': cells[1].text, 'port': cells[2].text})
    # Log once after collecting (original printed the growing list per row).
    print(ip_list)
    return ip_list

def get_urls(url, page, ip):
    """Fetch one page of 1688 company-search results and pick a random shop.

    Args:
        url:  base search URL (``beginPage`` query parameter is appended here).
        page: 1-based results page number to request.
        ip:   one proxy dict with ``'ip'`` and ``'port'`` keys (see get_IP()).

    Returns:
        str: the href of one randomly chosen shop on the results page.

    Raises:
        requests.RequestException: on network failure.
        AttributeError / IndexError: when the page layout differs or no
            results are present.
    """
    # Fixes vs. original: host and port must be joined with ':'; requests
    # takes 'proxies=' (there is no 'proxy=' kwarg); and each item's own
    # <a> is read — the original re-read the first link on every iteration.
    proxy = {'http': 'http://' + ip['ip'] + ':' + ip['port']}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'}
    html = requests.get(url=url + "&beginPage=" + str(page),
                        headers=headers, proxies=proxy, timeout=10).text
    soup = bs4.BeautifulSoup(html, "lxml")
    main_block = soup.find('div', attrs={'id': 'sw_mod_mainblock'})
    items = main_block.find('ul').find_all('div', class_='list-item-left')
    urls = [item.find('a').get('href') for item in items]
    print(urls)
    return random.choice(urls)

def get_contact(url_1, ip):
    """Fetch the contact-info page of one 1688 shop and return its fields.

    Args:
        url_1: shop home-page URL (as returned by get_urls()).
        ip:    one proxy dict with ``'ip'`` and ``'port'`` keys.

    Returns:
        list[str] | None: contact fields with newlines/spaces stripped,
        or None when any network request or page parse fails.
    """
    # Fixes vs. original: the body was indented 8 spaces under a 4-space
    # docstring (IndentationError — the module could not even be imported);
    # Response.text is a property, not a method; requests takes 'proxies=';
    # host:port needs a ':' separator; bare excepts narrowed to the errors
    # these calls actually raise; unused 'title' lookup removed.
    proxy = {'http': 'http://' + ip['ip'] + ':' + ip['port']}
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate'}
    session = requests.session()
    try:
        html = session.get(url_1, headers=headers, proxies=proxy, timeout=10).text
        # Locate the "contactinfo" tab link in the shop's top navigation bar.
        contact_url = (bs4.BeautifulSoup(html, 'lxml')
                       .find('div', class_='top-nav-bar-box')
                       .find('li', attrs={'data-page-name': 'contactinfo'})
                       .find('a').get('href'))
    except (requests.RequestException, AttributeError):
        # Network failure, or the shop page layout changed (a .find()
        # returned None). Best-effort: report and give up on this shop.
        print('-----------------')
        return None
    try:
        html = session.get(contact_url, headers=headers, proxies=proxy, timeout=10).text
        table = bs4.BeautifulSoup(html, 'lxml').find('div', class_='fd-line').find_all('dl')
        info = []
        # Last <dl> is skipped, matching the original's table[:-1] slice.
        for item in table[:-1]:
            info.append(item.get_text().replace('\n', '').replace(' ', ''))
        return info
    except (requests.RequestException, AttributeError):
        print("~~~~~~~~~~~~~~~~~~~")
        return None

def main():
    """Entry point: fetch a proxy pool, pick one shop, scrape its contacts.

    Fix vs. original: get_IP() returns a *list* of proxy dicts, but
    get_urls()/get_contact() index a single dict (``ip['ip']``) — passing
    the whole list raised TypeError. Choose one proxy from the pool.
    """
    url = "http://s.1688.com/company/company_search.htm?keywords=%BE%AB%C3%DC%BB%FA%D0%B5&earseDirect=false&button_click=top&n=y&pageSize=30"
    ip_pool = get_IP()
    if not ip_pool:
        print('no usable proxy found')
        return
    ip = random.choice(ip_pool)
    url_1 = get_urls(url, 1, ip)
    data = get_contact(url_1, ip)
    
if __name__ == "__main__":
    main()

# 你可能感兴趣的:(爬虫实战,爬虫实战)  (blog-export footer, kept as a comment)