# Script 1: crawl JD (jd.com) search results for "惠普电脑" (HP computers) and
# append every link found on each result page to jingdongurls.csv.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
def get_html(page):
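    """Request one page of JD search results (s_new.php) for '惠普电脑' and
    return the raw HTML fragment, or None if the request fails."""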
try:
tunnel_host = "tps136.kdlapi.com"
tunnel_port = "15818"
        # Tunnel ID and password for the KDL (kdlapi.com) proxy tunnel
tid = "t17811077831686"
password = "jq0bpw4i"
proxies = {
"http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
"https": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port)
}
headers = {
'authority': 'search.jd.com',
'accept': '*/*',
'x-requested-with': 'XMLHttpRequest',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'referer': 'https://search.jd.com/Search?keyword=%E6%83%A0%E6%99%AE%E7%94%B5%E8%84%91&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&wq=%E6%83%A0%E6%99%AE%E7%94%B5%E8%84%91&ev=exbrand_%E6%83%A0%E6%99%AE%EF%BC%88HP%EF%BC%89%5E&page=3&s=61&click=0',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8',
'cookie': 'shshshfpa=3dadc96e-af0d-4743-e8ce-1fb14a2d3285-1574337056; __jdu=15743370557421608408348; shshshfpb=rRtlBHtNe1FeDgYsijgXwfg%3D%3D; __jdv=76161171|direct|-|none|-|1578279028389; areaId=1; xtest=4393.cf6b6759; __jda=122270672.15743370557421608408348.1574337056.1574337056.1578279028.2; __jdc=122270672; qrsc=3; rkv=V0000; 3AB9D23F7A4B3C9B=XNZ2XZVVD2AXISW2R4PYVE34QAFFOOLGAEUVM6GQ4WOJGHCW5HHMOTHQQQH3RMSCJVMJN4GDSDTIXUVZAYT7S5W2LY; __jdb=122270672.3.15743370557421608408348|2.1578279028; shshshfp=12265fd5da11a64ec5dbbbb7dea9bd22; shshshsID=0a54aff692feb14ebc59cd4e914e3e92_3_1578279542800; ipLoc-djd=1-2800-2848-0',
}
params = (
            ('keyword', '\u60E0\u666E\u7535\u8111'),   # "惠普电脑" (HP computer)
('enc', 'utf-8'),
('qrst', '1'),
('rt', '1'),
('stop', '1'),
('vt', '2'),
('bs', '1'),
('wq', '\u60E0\u666E\u7535\u8111'),
('ev', 'exbrand_\u60E0\u666E\uFF08HP\uFF09^'),
('page', page),
            ('s', '87'),                               # result offset; left fixed while 'page' varies
('scrolling', 'y'),
('log_id', '1578279731.89470'),
('tpl', '1_M'),
)
        # Route the request through the tunnel proxy and bound the wait with a timeout.
        response = requests.get('https://search.jd.com/s_new.php', headers=headers,
                                params=params, proxies=proxies, timeout=10)
        html = response.text
        return html
except Exception as e:
print(e)
def get_urls(html):
    """Parse a search-result fragment and append every <a> href to jingdongurls.csv."""
    try:
        urls = []
        soup = BeautifulSoup(html, 'lxml')
        # href=True skips anchor tags without an href, which would otherwise raise KeyError
        for k in soup.find_all('a', href=True):
            urls.append(k['href'])
        ccc = pd.DataFrame(data=urls)
        ccc.to_csv('jingdongurls.csv', mode='a', encoding='utf-8-sig', header=False, index=False)
    except Exception as e:
        print(e)
def main():
    count = 0
    pages = list(range(100))
    for page in pages:
        count += 1
        print(count)              # progress: number of pages requested so far
        html = get_html(page)
        get_urls(html)
        time.sleep(2)             # pause between pages to avoid hammering the site
if __name__ == '__main__':
main()
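
# ---------------------------------------------------------------------------
# Script 2: read the product links collected above from jingdongurls.csv,
# fetch each item page through a rotating proxy with a Pool of 4 workers,
# extract the spec fields and append them to jingdonginfos.csv.
# (get_html and main below intentionally redefine the functions above;
# script 1 has already executed by the time these definitions are reached.)
# ---------------------------------------------------------------------------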
import requests
from lxml import etree
import csv
import time
import random
import pandas as pd
from multiprocessing import Pool
import datetime
import re

# Suppress the InsecureRequestWarning raised because the requests below use verify=False.
requests.packages.urllib3.disable_warnings()
def get_html(url):
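    """Fetch https://item.jd.com/<id>.html through the Abuyun proxy tunnel,
    retrying up to three times; the item ID is appended to furls.csv on every
    failed attempt. Returns the page HTML, or None if all attempts fail."""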
i = 0
while i <= 2:
try:
f_urls=[]
'''tunnel_host = "tps136.kdlapi.com"
tunnel_port = "15818"
# 隧道id和密码
tid = "t17811077831686"
password = "jq0bpw4i"
proxies = {
"http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
"https": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port)
}'''
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"
            # Proxy tunnel credentials (Abuyun)
proxyUser = 'H7G9A46LZ0QMU12D'
proxyPass = '78D1D2D37DE56698'
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
"host": proxyHost,
"port": proxyPort,
"user": proxyUser,
"pass": proxyPass,
}
proxies = {
"http": proxyMeta,
"https": proxyMeta,
}
headers = {
'Connection': 'keep-alive',
'Accept': '*/*',
'X-Requested-With': 'XMLHttpRequest',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'cors',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8',
}
            response = requests.get('https://item.jd.com/' + url + '.html', headers=headers,
                                    proxies=proxies, verify=False, timeout=10)
            html = response.text
            return html
        except Exception as e:
            i += 1
            print('Request failed {} time(s): {}'.format(i, e))
            # Log the item ID that failed so it can be re-crawled later.
            f_urls.append(url)
            ccc = pd.DataFrame(data=f_urls)
            ccc.to_csv('furls.csv', mode='a', encoding='utf-8-sig', header=False, index=False)
            time.sleep(random.uniform(0, 1))
def get_infos(html,url):
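    """Extract spec fields from an item detail page using positional XPaths.
    Each xpath() call returns a (possibly empty) list, written to the CSV as-is;
    returns None if parsing raises."""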
try:
infos=[]
xpaths = etree.HTML(html)
mingcheng = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[1]/text()')
pinpai = xpaths.xpath('//*[@id="parameter-brand"]/li/a/text()')
# pinglun=xpaths.xpath('//*[@id="comment-count"]/a/text()')[0]
jiage = xpaths.xpath('//span[@class="pricing"]/del/text()')
maozhong = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[3]/text()')
chandi = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[4]/text()')
huohao = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[5]/text()')
bianhao = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[2]/text()')
xitong = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[6]/text()')
yingpanrongliang = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[8]/text()')
chuliqi = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[10]/text()')
neicun = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[17]/text()')
pingmu = xpaths.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[16]/text()')
        data = {
            '链接': 'https://item.jd.com/' + url + '.html',   # product URL
            '名称': mingcheng,              # product name
            '品牌': pinpai,                 # brand
            # '评论数': pinglun,            # number of comments (disabled above)
            '价格': jiage,                  # price
            '毛重': maozhong,               # gross weight
            '产地': chandi,                 # place of origin
            '货号': huohao,                 # item number
            '编号': bianhao,                # product code
            '系统': xitong,                 # operating system
            '硬盘容量': yingpanrongliang,   # hard-drive capacity
            '处理器': chuliqi,              # CPU
            '内存': neicun,                 # RAM
            '屏幕尺寸': pingmu              # screen size
        }
        infos.append(data)
        info = infos[-1:]          # one-element list, the shape the Pool callback expects
        time.sleep(random.uniform(0, 0.5))
        return info
except Exception as e:
print(e)
time.sleep(random.uniform(0, 1))
def main(url, urls):
    # Worker task: print progress (position in the url list), then fetch and parse one item.
    print(urls.index(url))
    html = get_html(url)
    info = get_infos(html, url)
    return info
def infos_to_csv(info):
    # Pool callback: skip None results (fetch/parse failed), then append the record to the CSV.
    if not info:
        return
    ccc = pd.DataFrame.from_dict(info)
    ccc.to_csv('jingdonginfos.csv', mode='a', encoding='utf-8-sig', header=False, index=False)
if __name__ == '__main__':
    i = 0
    # Run the full crawl pass while i <= 1: a successful pass adds 1 to i, a failed
    # pass (see the except below) adds only 0.5, so failures are retried more often.
    while i <= 1:
        try:
            print(i)
e1 = datetime.datetime.now()
f1 = open('C:\\Users\\joshua\\Desktop\\python\\京东\\jingdongurls.csv', 'r', encoding='utf-8-sig')
#f2 = open('C:\\Users\\joshua\\Desktop\\python\\dongguan\\东莞1.csv', 'r', encoding='utf-8-sig')
#f3 = open('C:\\Users\\joshua\\Desktop\\python\\dongguan\\东莞不存在的链接1.csv', 'r', encoding='utf-8-sig')
csvreader1 = csv.reader(f1)
#csvreader2 = csv.reader(f2)
#csvreader3 = csv.reader(f3)
            columns1 = [column[0] for column in csvreader1 if column]   # skip blank rows
#columns2 = [column[2] for column in csvreader2]
#columns3 = [column[0] for column in csvreader3]
            urls = []
            # De-duplicate the collected links while preserving their original order.
            columns1_qc = list(set(columns1))
            print(len(columns1_qc))
            columns1_qc.sort(key=columns1.index)
            for column in columns1_qc:
                # Keep only links to item detail pages and extract the numeric item ID
                # from //item.jd.com/<id>.html.
                url = re.findall('//item.jd.com/(.*).html', column, re.S)
                if url:
                    urls.append(url[0])
            p = Pool(4)
            # Each worker scrapes one item; the callback appends the returned record to the CSV.
            # for url in urls[0:1]:      # debug: limit the run to a single url
            for url in urls:
                p.apply_async(main, (url, urls), callback=infos_to_csv)
            p.close()
            p.join()
e2 = datetime.datetime.now()
print((e2 - e1))
i += 1
        except Exception as e:
            print(e)
            i += 0.5