1. Without logging in you can only browse the first ten pages of results; logged in, you can crawl the first hundred. (You will have to work around this yourself; I don't have a good solution either.)
2. Sogou WeChat's main anti-crawling measures are IP bans and cookie bans.
Take a look at the screenshot first. The block page talks about your IP, but it is usually not the IP that has been flagged; it is the cookies, and more specifically the SNUID value.
Visiting the video category on Sogou (v.sogou.com) returns a fresh set of cookies,
e.g.:
{"IPLOC": "CN1100", "SNUID": "792CA21E6366ED3196EDFE8A6396EF4D", "SUV": "00BF427F7CC14F1A5D36A351F020A401", "JSESSIONID": "aaay_uC7scjDFfXEFBDWw"}
Just take the SNUID out of that set.
The other cookie values used for weixin.sogou.com can simply be copied from your browser (they stay valid for about a year) or generated automatically.
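Stripped down, the refresh step looks roughly like this (a minimal sketch; the full script below does the same thing in get_cookies()/get_req(), additionally going through a proxy and caching the result in a file; fetch_snuid and the hard-coded User-Agent string are just illustrative):

import requests

def fetch_snuid():
    # One request to Sogou's video search is enough; the response sets IPLOC/SNUID/SUV/JSESSIONID cookies.
    url = "https://v.sogou.com/v?ie=utf8&query=&p=40030600"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    resp = requests.get(url, headers=headers, allow_redirects=False, timeout=10)
    # Only SNUID needs refreshing; the remaining cookie values can be reused.
    return resp.cookies.get_dict().get("SNUID")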
With that, the big obstacles are out of the way.
Code
from functools import reduce
import phpserialize
import time
import requests
import json
import random
import logging
from logging.handlers import RotatingFileHandler
import os
from lxml import etree
from fake_useragent import UserAgent
from database.db_function import get_keyword, update_flag, get_proxy
from database.piplines import DataPipeline
from operate.function import hash
#
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
# RotatingFileHandler: rotate the log at 10 KB per file, keeping at most 5 backups
rHandler = RotatingFileHandler("sgwx_log.txt", maxBytes=10 * 1024, backupCount=5)
rHandler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
rHandler.setFormatter(formatter)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
logger.addHandler(rHandler)
logger.addHandler(console)
yq = 5  # source/category id handed to the database helpers (its exact meaning depends on the project's schema)
ua = UserAgent().random
header = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Cookie": "ABTEST=2|1561358499|v1; IPLOC=CN1100; SUID=1A4FC17C4631990A000000005D1070A3; SUV=00CDEE0D7CC14F1A5D1070A35DFF7862; SUID=1A4FC17C2113940A000000005D1070A3; weixinIndexVisited=1; pgv_pvi=2997798912; ld=tZllllllll2N9tTHlllllV1S8k7lllllKGV5pklllltlllllpylll5@@@@@@@@@@; LSTMV=682%2C416; LCLKINT=8742; SNUID=F7A22C91EDE961A343DEFB22EDD4F97E; sct=14; JSESSIONID=aaaoA1hhCs7Mf8ji7bsTw",
"Host": "weixin.sogou.com",
"Pragma": "no-cache",
"Referer": "https://weixin.sogou.com/weixin?usip=&query=%E9%98%BF%E5%B0%94%E6%B3%95%E7%8B%97&ft=&tsn=1&et=&interation=&type=2&wxid=&page=2&ie=utf8",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
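# Note: the Cookie and User-Agent above are only defaults; get_req() below replaces them with a
# cookie string containing a freshly fetched SNUID and a random fake_useragent value.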
def detecting_keywords():
    """Fetch pending keywords from the database, keeping only entries whose page field mentions "wx_data"."""
    key_list = get_keyword(yq)
if len(key_list) == 0:
return []
all_key_list = sorted(key_list, key=lambda x: x[0], reverse=False)
data_list = []
for data in all_key_list:
keyword = data[1]
pid = data[2]
page = data[4]
if '"wx_data"' not in page:
update_flag(keyword, pid, yq)
continue
data_list.append([keyword, pid])
return data_list
def random_steep():
"""
    Pause for a random number of seconds to reduce the risk of getting blocked.
:return:
"""
a = random.randint(2, 8)
logger.info("暂停{}秒".format(a))
time.sleep(a)
def get_cookies():
"""
    Request Sogou's video search to obtain fresh cookies and cache them in a file.
:return:
"""
logger.info("获取微信snuid cookies")
url = 'https://v.sogou.com/v?ie=utf8&query=&p=40030600'
headers = {'User-Agent': ua}
proxies = {
"http": "http://" + get_proxy()
}
rst = requests.get(url=url, headers=headers, allow_redirects=False, proxies=proxies)
cookies = rst.cookies.get_dict()
with open("wx_cookie.json", "w+")as f:
f.write(json.dumps(cookies))
f.close()
logger.info("wx_cookies获取成功")
def timestamp(dt):
"""
    Convert a "%Y-%m-%d %H:%M:%S" datetime string to a Unix timestamp.
:param dt:
:return:
"""
timeArray = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
timestamp = time.mktime(timeArray)
return timestamp
def yesterday():
"""
    Return the Unix timestamp for this time yesterday.
:return:
"""
timestamp = int(time.time())
yesterday = timestamp - 24 * 60 * 60
return yesterday
def get_req():
"""
    Load the cached cookies from file and build a requests session that uses them.
:return:
"""
logger.info("打开cookie文件获取cookies")
with open("wx_cookie.json", "r")as f:
cookies = json.loads(f.readline())
snuid = cookies["SNUID"]
cookie = "ABTEST=2|1561358499|v1; IPLOC=CN1100; SUID=1A4FC17C4631990A000000005D1070A3; SUV=00CDEE0D7CC14F1A5D1070A35DFF7862; SUID=1A4FC17C2113940A000000005D1070A3; weixinIndexVisited=1; pgv_pvi=2997798912; ld=tZllllllll2N9tTHlllllV1S8k7lllllKGV5pklllltlllllpylll5@@@@@@@@@@; LSTMV=682%2C416; LCLKINT=8742; SNUID={}; sct=14; JSESSIONID=aaaoA1hhCs7Mf8ji7bsTw".format(
snuid)
header["Cookie"] = cookie
header["User-Agent"] = ua
req = requests.Session()
req.headers = header
return req
def cookies_expried():
"""
    Check whether the cached cookies still work; if not, fetch a new SNUID and rebuild the session.
:return:
"""
file = os.path.isfile("wx_cookie.json")
if not file:
get_cookies()
req = get_req()
url = "https://weixin.sogou.com/weixin?usip=&query=%E9%98%BF%E5%B0%94%E6%B3%95%E7%8B%97&ft=&tsn=1&et=&interation=&type=2&wxid=&page=3&ie=utf8"
response = req.get(url, allow_redirects=False)
logger.info("验证cookies是否可用")
login_state = True
if response.status_code == 302:
login_state = False
if login_state:
logger.info("cookies可用")
return req
else:
logger.info("cookies不可用,重新获取cookies")
get_cookies()
req = get_req()
return req
def into_database(items):
pipeline = DataPipeline()
pipeline.process_item(items=items)
def list_dict_duplicate_removal(data_list):
    """Remove duplicate dicts from a list while preserving order."""
    run_function = lambda x, y: x if y in x else x + [y]
    return reduce(run_function, [[], ] + data_list)
def main():
while True:
keywords = detecting_keywords()
if keywords == []:
logger.info("本次无需采集信息 暂停1m")
time.sleep(60)
continue
for key in keywords:
keyword = key[0]
pid = key[1]
logger.info("当前关键词是:{}".format(keyword))
start_url = "https://weixin.sogou.com/weixin?usip=&query={}&ft=&tsn=1&et=&interation=&type=2&wxid=&page={}&ie=utf8"
req = cookies_expried()
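            # Crawl at most 10 result pages per keyword (the unauthenticated limit mentioned at the top).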
for i in range(1, 11):
random_steep()
proxies = {
"http": "http://" + get_proxy()
}
response = req.get(url=start_url.format(keyword, i), allow_redirects=False, proxies=proxies)
                if response.status_code == 302:
                    # Redirected to the CAPTCHA page: the IP/cookie pair has been flagged and needs to be changed.
                    logger.warning("CAPTCHA triggered, the IP needs to be changed")
rex = etree.HTML(response.text)
                infos = rex.xpath('//ul[@class="news-list"]/li')  # result items on this page
                logger.info("Results on this page: {}".format(len(infos)))
if len(infos) == 0:
update_flag(keyword, pid, yq)
break
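                # Build one item dict per search result; the list is deduplicated and written to the database below.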
rank = 0
save_data = []
for data in infos:
rank += 1
ranks = (int(i) - 1) * 10 + rank
link = data.xpath('string(./div[@class="txt-box"]/h3/a/@data-share)')
_title = data.xpath('string(./div[@class="txt-box"]/h3/a)')
summary = data.xpath('string(./div[@class="txt-box"]/p)')
screen_name = data.xpath('string(./div[@class="txt-box"]/div[@class="s-p"]/a)')
timestamps = data.xpath('string(./div[@class="txt-box"]/div[@class="s-p"]/@t)')
tup = (pid, keyword, timestamps, _title, 1, yq)
                    serialized = phpserialize.dumps(tup)  # avoid shadowing the loop variable `data`
                    guid = hash(serialized)
item = dict()
item['mid'] = 0
item['guid'] = guid
item['short_url'] = link
item['rank_old'] = 0
item['yq'] = yq
item['nr'] = 1
item['sh'] = 0
item['qr'] = 0
item['cjtype'] = 1
item['update_time'] = int(time.time())
item['del_time'] = 0
item['status'] = 1
item['isnew_email'] = 0
item['ftype'] = 1
item['isnew_wx'] = 0
item['froms'] = 0
item['zf'] = 1
item['title'] = _title
item['url'] = link
item['screen_name'] = screen_name
item['content'] = summary
item['screen_time'] = timestamps
item['dzs'] = 0
item['zfs'] = 0
item['pls'] = 0
item['keyword'] = keyword
item['rank'] = ranks
item['page'] = i
item['pid'] = pid
item['is_white'] = 1
save_data.append(item)
save_datas = list_dict_duplicate_removal(save_data)
into_database(save_datas)
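                # Stop paging when Sogou no longer offers a next-page link.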
next_page = rex.xpath('//a[@id="sogou_next"]/@href')
if len(next_page) == 0:
logger.info("已完成 当前第{}页".format(i + 1))
update_flag(keyword, pid, yq)
break
if int(i) > 10:
logger.info("已完成 当前第{}页".format(i + 1))
update_flag(keyword, pid, yq)
break
logger.info("翻页 当前第{}页".format(i + 1))
if __name__ == '__main__':
main()