Code
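The first script below pages through JD search results for HP laptops (惠普电脑) via the s_new.php AJAX endpoint and appends every link on each results page to jingdongurls.csv; the second script then reads that file, extracts the product IDs, and scrapes each item detail page in parallel. The proxy tunnel credentials, cookie, and file paths are specific to the author's environment.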

import requests
from lxml import etree
from bs4 import BeautifulSoup
import csv
from multiprocessing import Pool
import pandas as pd
import time
def get_html(page):
    try:
        tunnel_host = "tps136.kdlapi.com"
        tunnel_port = "15818"

        # tunnel ID and password
        tid = "t17811077831686"
        password = "jq0bpw4i"

        proxies = {
            "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
            "https": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port)
        }
        headers = {
            'authority': 'search.jd.com',
            'accept': '*/*',
            'x-requested-with': 'XMLHttpRequest',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'cors',
            'referer': 'https://search.jd.com/Search?keyword=%E6%83%A0%E6%99%AE%E7%94%B5%E8%84%91&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&wq=%E6%83%A0%E6%99%AE%E7%94%B5%E8%84%91&ev=exbrand_%E6%83%A0%E6%99%AE%EF%BC%88HP%EF%BC%89%5E&page=3&s=61&click=0',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8',
            'cookie': 'shshshfpa=3dadc96e-af0d-4743-e8ce-1fb14a2d3285-1574337056; __jdu=15743370557421608408348; shshshfpb=rRtlBHtNe1FeDgYsijgXwfg%3D%3D; __jdv=76161171|direct|-|none|-|1578279028389; areaId=1; xtest=4393.cf6b6759; __jda=122270672.15743370557421608408348.1574337056.1574337056.1578279028.2; __jdc=122270672; qrsc=3; rkv=V0000; 3AB9D23F7A4B3C9B=XNZ2XZVVD2AXISW2R4PYVE34QAFFOOLGAEUVM6GQ4WOJGHCW5HHMOTHQQQH3RMSCJVMJN4GDSDTIXUVZAYT7S5W2LY; __jdb=122270672.3.15743370557421608408348|2.1578279028; shshshfp=12265fd5da11a64ec5dbbbb7dea9bd22; shshshsID=0a54aff692feb14ebc59cd4e914e3e92_3_1578279542800; ipLoc-djd=1-2800-2848-0',
        }

        params = (
            ('keyword', '\u60E0\u666E\u7535\u8111'),
            ('enc', 'utf-8'),
            ('qrst', '1'),
            ('rt', '1'),
            ('stop', '1'),
            ('vt', '2'),
            ('bs', '1'),
            ('wq', '\u60E0\u666E\u7535\u8111'),
            ('ev', 'exbrand_\u60E0\u666E\uFF08HP\uFF09^'),
            ('page', page),
            ('s', '87'),
            ('scrolling', 'y'),
            ('log_id', '1578279731.89470'),
            ('tpl', '1_M'),
        )

        # pass the proxies defined above, otherwise the tunnel is never actually used
        response = requests.get('https://search.jd.com/s_new.php', headers=headers, params=params, proxies=proxies)
        html = response.text
        return html
    except Exception as e:
        print(e)
def get_urls(html):
    try:
        urls = []
        html = BeautifulSoup(html,'lxml')
        for k in html.find_all('a', href=True):
            urls.append(k['href'])
        ccc = pd.DataFrame(data=urls)
        ccc.to_csv('jingdongurls.csv', mode='a', encoding='utf-8-sig', header=False, index=False)
    except Exception as e:
        print(e)
def main():
    count=0
    pages=[i for i in range(0,100)]
    for page in pages:
        count+=1
        print(count)
        html=get_html(page)
        get_urls(html)
        time.sleep(2)
if __name__ == '__main__':
    main()
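get_urls above saves every href it finds on the results page, so jingdongurls.csv also picks up navigation, ad, and recommendation links. A minimal sketch of a stricter collector, assuming product links always point at item.jd.com and writing to a hypothetical jd_item_urls.csv, could look like this:

import re
import pandas as pd
from bs4 import BeautifulSoup

def get_item_urls(html):
    # keep only hrefs that look like product detail pages
    soup = BeautifulSoup(html, 'lxml')
    item_urls = [a['href'] for a in soup.find_all('a', href=True)
                 if re.search(r'item\.jd\.com/\d+\.html', a['href'])]
    pd.DataFrame(item_urls).to_csv('jd_item_urls.csv', mode='a',
                                   encoding='utf-8-sig', header=False, index=False)

The second script below reads the jingdongurls.csv produced above, de-duplicates the links, extracts the product IDs with a regular expression, and then fetches each item detail page through a proxy tunnel using a multiprocessing pool of 4 workers.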

import requests
import json
import jsonpath
from lxml import etree
import csv
import time
import itertools
from fake_useragent import UserAgent
import random
import pandas as pd
from multiprocessing import Pool, Lock, Manager
import datetime
requests.packages.urllib3.disable_warnings()
import re

def get_html(url):
    i = 0
    while i <= 2:
        try:
            f_urls=[]
            '''tunnel_host = "tps136.kdlapi.com"
            tunnel_port = "15818"

            # tunnel ID and password
            tid = "t17811077831686"
            password = "jq0bpw4i"

            proxies = {
                "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
                "https": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port)
            }'''
            proxyHost = "http-dyn.abuyun.com"
            proxyPort = "9020"

            # proxy tunnel credentials
            proxyUser = 'H7G9A46LZ0QMU12D'
            proxyPass = '78D1D2D37DE56698'

            proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
                "host": proxyHost,
                "port": proxyPort,
                "user": proxyUser,
                "pass": proxyPass,
            }

            proxies = {
                "http": proxyMeta,
                "https": proxyMeta,
            }
            headers = {
                'Connection': 'keep-alive',
                'Accept': '*/*',
                'X-Requested-With': 'XMLHttpRequest',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
                'Sec-Fetch-Site': 'same-origin',
                'Sec-Fetch-Mode': 'cors',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8',
            }
            response = requests.get('https://item.jd.com/'+url+'.html', headers=headers, proxies=proxies, verify=False)
            html = response.text
            return html
        except Exception as e:
            i += 1
            print('Request failed {} time(s)'.format(i))
            # record the failing url so it can be retried later (note: appended on every failed attempt)
            f_urls.append(url)
            ccc = pd.DataFrame(data=f_urls)
            ccc.to_csv('furls.csv', mode='a', encoding='utf-8-sig', header=False, index=False)
            time.sleep(random.uniform(0, 1))
def get_infos(html,url):
    try:
        infos=[]
        xpaths = etree.HTML(html)
        mingcheng = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[1]/text()')
        pinpai = xpaths.xpath('//*[@id="parameter-brand"]/li/a/text()')
        # pinglun=xpaths.xpath('//*[@id="comment-count"]/a/text()')[0]
        jiage = xpaths.xpath('//span[@class="pricing"]/del/text()')
        maozhong = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[3]/text()')
        chandi = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[4]/text()')
        huohao = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[5]/text()')
        bianhao = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[2]/text()')
        xitong = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[6]/text()')
        yingpanrongliang = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[8]/text()')
        chuliqi = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[10]/text()')
        neicun = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[17]/text()')
        pingmu = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[16]/text()')
        data = {
            '链接': 'https://item.jd.com/'+url+'.html',
            '名称': mingcheng,
            '品牌': pinpai,
            # '评论数': pinglun,
            '价格': jiage,
            '毛重': maozhong,
            '产地': chandi,
            '货号': huohao,
            '编号': bianhao,
            '系统': xitong,
            '硬盘容量': yingpanrongliang,
            '处理器': chuliqi,
            '内存': neicun,
            '屏幕尺寸': pingmu
        }
        infos.append(data)
        info = infos[-1:]
        time.sleep(random.uniform(0, 0.5))
        return info
    except Exception as e:
        print(e)
        time.sleep(random.uniform(0, 1))
def main(url,urls):
    print(urls.index(url))
    html = get_html(url)
    info = get_infos(html, url)
    return info
def infos_to_csv(info):
    ccc = pd.DataFrame.from_dict(info)
    ccc.to_csv('jingdonginfos.csv', mode='a', encoding='utf-8-sig', header=False, index=False)
if __name__ == '__main__':
    i = 0
    while i <= 1:
        try:
            print(i)
            e1 = datetime.datetime.now()
            f1 = open('C:\\Users\\joshua\\Desktop\\python\\京东\\jingdongurls.csv', 'r', encoding='utf-8-sig')
            #f2 = open('C:\\Users\\joshua\\Desktop\\python\\dongguan\\东莞1.csv', 'r', encoding='utf-8-sig')
            #f3 = open('C:\\Users\\joshua\\Desktop\\python\\dongguan\\东莞不存在的链接1.csv', 'r', encoding='utf-8-sig')
            csvreader1 = csv.reader(f1)
            #csvreader2 = csv.reader(f2)
            #csvreader3 = csv.reader(f3)
            columns1 = [column[0] for column in csvreader1]
            #columns2 = [column[2] for column in csvreader2]
            #columns3 = [column[0] for column in csvreader3]
            urls = []
            columns1_qc = list(set(columns1))
            print(len(columns1_qc))
            columns1_qc.sort(key=columns1.index)
            for column in columns1_qc:
                #print(column)
                url=re.findall('//item.jd.com/(.*).html',column,re.S)
                if len(url)==0:
                    pass
                else:
                    urls.append(url[0])
            p = Pool(4)
            # for url in urls1[0:1]:
            for url in urls:
                p.apply_async(main, (url, urls), callback=infos_to_csv)  
            p.close()
            p.join()
            e2 = datetime.datetime.now()
            print((e2 - e1))
            i += 1
        except Exception as e:
            # on failure, print the error and bump the counter by 0.5 so the whole pass is retried a limited number of times
            print(e)
            i += 0.5
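Failed requests are appended to furls.csv but never retried by the script itself. A rough second-pass sketch, assuming furls.csv is in the working directory and reusing main and infos_to_csv from above, might be:

def retry_failed():
    # re-read the urls that failed during the first pass and scrape them again
    with open('furls.csv', 'r', encoding='utf-8-sig') as f:
        failed = [row[0] for row in csv.reader(f) if row]
    failed = list(set(failed))
    p = Pool(4)
    for url in failed:
        p.apply_async(main, (url, failed), callback=infos_to_csv)
    p.close()
    p.join()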

