# Probably can't crawl it all anyway; there isn't much data. There's a trick that should get a few dozen pages. As for the slider captcha, I have no answer -- pyppeteer can't get past it either. Dead end.
# -*- coding: utf-8 -*-
import scrapy
import json
import re
import requests
import time
import random
from pl.settings import User_Agents
from bson.objectid import ObjectId
import pymongo
client = pymongo.MongoClient(host='localhost', port=27017)
collection = client.proxies_pool.taobao  # proxy pool (read)
db = client.mylove1  # goods and comments are written here
"""
protocol: 代理ip支持的协议类型,http 0 https 1 https和http 都支持是2
nick_type:代理ip的匿名程度 高匿名:0 匿名:1 透明:2
"""
class PplSpider(scrapy.Spider):
name = 'ppl'
# allowed_domains = ['www.com']
start_urls = ['https://www.taobao.com/']
    def start_requests(self):
        page = 0  # page index; Taobao's s parameter is an item offset, 44 items per page
        url_mode = 'https://s.taobao.com/search?q=%E6%B8%B8%E6%88%8F&s=' + str(page * 44)
        cookies = ''  # paste a logged-in cookie string here; see the commented example below
# cookies='thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; UM_distinctid=1703cb6d7be32d-0e3b11cf905c1f-34555273-144000-1703cb6d7bf172; enc=Ao1IiAwGM8RE4yzNSG%2FYhb0Kgn28voJwNWKbtehvqH8sQFEmsjsfYicZLEg6o9tFRb4qMIoDI39iwy9WaZ4kbA%3D%3D; _m_h5_tk=ece571f60ecb26cfe6f90fc58977c176_1582994487261; _m_h5_tk_enc=c8f90925d365a927395d6aa1eaec694d; t=db0526d878c7e5a2d5bb67df8a83e3c7; cna=pHHyFY4K0DECAXAs6oDPW13o; lgc=tb403473694; tracknick=tb403473694; tg=0; mt=ci=1_1; v=0; cookie2=1fe1eeb6fcd457b217068cab16579d7b; _tb_token_=3bf5600353e86; _samesite_flag_=true; sgcookie=DMo2xhVGqw9Hc8tJ1Zlsp; unb=3602273622; uc3=lg2=W5iHLLyFOGW7aA%3D%3D&id2=UNaHcdRO%2F8Eqbg%3D%3D&vt3=F8dBxd33Ccb36eOhdS4%3D&nk2=F5RBxfUsC8zSN58%3D; csg=5126fa67; cookie17=UNaHcdRO%2F8Eqbg%3D%3D; dnk=tb403473694; skt=514b655ed438e6f5; existShop=MTU4MzA2MDIzNg%3D%3D; uc4=id4=0%40UgGO7J7zce13jOZzgScaYCbK%2BQud&nk4=0%40FY4KoN69jv0V93baScdV5i0BmdZAdw%3D%3D; _cc_=VT5L2FSpdA%3D%3D; _l_g_=Ug%3D%3D; sg=42c; _nk_=tb403473694; cookie1=AiGca0Q82E%2B%2BFtbZzZ59IBNSNoOkruDIfCfC7dFmGeo%3D; tfstk=c9NNBPsIu1CNN_6UEbGqcp8ojmaOZ4omVBurI8xFHEEDJ2kGiLxxRpMI8qJnzAf..; uc1=cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&cookie21=W5iHLLyFeYZ1WM9hVnmS&cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&existShop=false&paszslIQhLQq3BsBOCN5A-d5P_9jIRA_u8w91cBi_5B56Lsg27OoRAF3Fp6cjWfGj8B4KYQmP29-etujPy06Pt-g3fPNxDc.; isg=BMbGrHdV5IqZxLBtJspWDEFsF7xIJwrhckVuHbDvsenEs2bNGLU-8CTJi-9_HAL5'
        # split defensively: the naive comprehension crashes on an empty cookie string
        cookies = {i.split("=", 1)[0].strip(): i.split("=", 1)[1] for i in cookies.split(";") if "=" in i}
        header = {
            'User-Agent': random.choice(User_Agents),
            'referer': 'https://www.taobao.com/'
        }
        yield scrapy.Request(url_mode, callback=self.parse, headers=header, cookies=cookies, meta={'page': page})
    def parse(self, response):
        last_page = -3  # sentinel so the first comment request is always sent
        page = response.meta['page']
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" + str(page))
        html = response.text
        goods_match = re.findall(r'g_page_config = ({.*?});', html, re.S)
        goods_items = json.loads(goods_match[0])
        goods_items = goods_items['mods']['itemlist']['data']['auctions']
        for goods_item in goods_items:
            goods = {
                'title': goods_item['raw_title'],
                'price': goods_item['view_price'],
                'location': goods_item['item_loc'],
                'sales': goods_item['view_sales']  # not reliably available here; really needs the rate API
            }
            print(goods)
            # db.mylove1.insert_one(goods)  # TODO: enable the insert
itemId = goods_item["nid"]
user_id = goods_item["user_id"]
print(itemId)
print(user_id)
time.sleep(10)
            # for i in range(1, 2):
            #     all_ip_list = []
            #     for item in collection.aggregate([{'$sample': {'size': 12}}]):  # sample 12 docs at random
            #         if item['protocol'] >= 1:
            #             ip = item['_id'] + ":" + item['port']
            #             all_ip_list.append(ip)
            #     ip = random.choice(all_ip_list)
            #     proxies = {
            #         'https': 'https://' + ip,  # for https connections
            #     }
time.sleep(random.randint(6,7))
url = "https://rate.tmall.com/list_detail_rate.htm"
            header = {
                "cookie": "",  # a logged-in cookie string is needed here as well
                "referer": "https://detail.tmall.com/item.htm",
                'User-Agent': random.choice(User_Agents)
            }
            for i in range(1, 100):
                params = {  # required parameters
                    "itemId": itemId,       # item id
                    "sellerId": user_id,    # shop/seller id
                    "currentPage": str(i),  # page number
                }
                # req = requests.get(url, params, headers=header, proxies=proxies, verify=False).text
                if i - 1 == last_page:  # the previous request already hit the last page
                    break
                else:
                    comment = requests.get(url, params, headers=header).text
                    print(comment)
                    comment = re.findall(r'({.*?}})', comment)
                    comment = json.loads(comment[0])
                    comment_list = comment["rateDetail"]["rateList"]
                    last_page = comment["rateDetail"]['paginator']['lastPage']
                    for each_comment in comment_list:
                        dic = {}
                        if each_comment['rateContent']:  # skip empty reviews
                            dic['comment_content'] = each_comment['rateContent']
                            # db.mylove1.insert_one(dic)  # insert every review, until the account gets banned
                    # yield TbItem(
                    #     good_inform=dic['good_inform'],
                    #     comment_time=dic['comment_time'],
                    #     comment_content=dic['comment_content']
                    # )
        page += 1  # advance to the next search page
        url_mode = 'https://s.taobao.com/search?q=%E6%B8%B8%E6%88%8F&s=' + str(page * 44)
        cookies = ''  # same logged-in cookie string as in start_requests (full example above)
        cookies = {i.split("=", 1)[0].strip(): i.split("=", 1)[1] for i in cookies.split(";") if "=" in i}
        header = {
            'User-Agent': random.choice(User_Agents),
            'referer': 'https://www.taobao.com/'
        }
        yield scrapy.Request(url_mode, callback=self.parse, headers=header, cookies=cookies, meta={'page': page})
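# Usage sketch, assuming this file lives in a Scrapy project named `pl` (as the
# `from pl.settings import User_Agents` import suggests):
#   scrapy crawl ppl
# A logged-in Taobao cookie string has to be pasted into both `cookies = ''`
# slots above; without it the search pages will most likely redirect to the
# login / slider-captcha check mentioned in the note at the top.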