Scraping Sogou WeChat List Pages: Skipping the CAPTCHA

The key points when scraping Sogou WeChat are the following:

1. Without logging in you can only browse the first ten pages; even when logged in you can only crawl the first hundred. (You will have to work around this yourself; I have no good solution either.)

2. Sogou WeChat's main anti-scraping measures are banning the IP and banning the cookies.
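Both defenses point to the same countermeasure: send every request through a fresh proxy and keep the cookies fresh. As a minimal sketch of the per-request proxy rotation (it reuses the get_proxy() helper that the full script below imports from database.db_function, which is assumed to return an "ip:port" string):

import requests
from database.db_function import get_proxy  # same proxy helper used by the full script below

def fetch(url, headers):
    # A fresh proxy for every request; get_proxy() is assumed to return an "ip:port" string
    proxies = {"http": "http://" + get_proxy()}
    # allow_redirects=False keeps the anti-bot 302 redirect visible to the caller
    return requests.get(url, headers=headers, proxies=proxies, allow_redirects=False)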

[Figure 1: the Sogou anti-bot prompt page]

Look at the screenshot above first. The page blames your IP, but in practice it is not actually an IP problem; it is usually the cookies that have gone bad, and more specifically the SNUID value.

A fresh SNUID can be obtained through Sogou's video search.
[Figure 2: Sogou video search category page]

When you request a video category page on Sogou video search, the response sets a batch of cookies, e.g.:

{"IPLOC": "CN1100", "SNUID": "792CA21E6366ED3196EDFE8A6396EF4D", "SUV": "00BF427F7CC14F1A5D36A351F020A401", "JSESSIONID": "aaay_uC7scjDFfXEFBDWw"}

Just take the SNUID out of that dict.

The other cookie values Sogou WeChat needs can simply be copied from your browser (they stay valid for about a year), or they can be generated automatically.
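As a minimal sketch of the whole trick (the video-search URL is the same one the full script below uses; the cookie template with the "..." placeholders stands in for whatever you copied from your own browser):

import requests

def fresh_snuid():
    # Requesting a Sogou video-search category page sets a new SNUID cookie without any login
    url = "https://v.sogou.com/v?ie=utf8&query=&p=40030600"
    rst = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, allow_redirects=False)
    return rst.cookies.get_dict()["SNUID"]

# The remaining values are copied from the browser once; only the SNUID is refreshed
cookie_template = "ABTEST=...; IPLOC=CN1100; SUID=...; SUV=...; weixinIndexVisited=1; SNUID={}; JSESSIONID=..."
cookie_header = cookie_template.format(fresh_snuid())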

With that, the major obstacles are out of the way.

Code
 

from functools import reduce
import phpserialize
import time
import requests
import json
import random
import logging
from logging.handlers import RotatingFileHandler
import os
from lxml import etree
from fake_useragent import UserAgent
from database.db_function import get_keyword, update_flag, get_proxy
from database.piplines import DataPipeline
from operate.function import hash

# Logging: write to a rotating file and to the console
logger = logging.getLogger(__name__)
logger.setLevel(level=logging.INFO)
# RotatingFileHandler: keep at most 5 backup log files, each up to 10 KB
rHandler = RotatingFileHandler("sgwx_log.txt", maxBytes=10 * 1024, backupCount=5)
rHandler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
rHandler.setFormatter(formatter)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
logger.addHandler(rHandler)
logger.addHandler(console)

yq = 5
ua = UserAgent().random
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Cookie": "ABTEST=2|1561358499|v1; IPLOC=CN1100; SUID=1A4FC17C4631990A000000005D1070A3; SUV=00CDEE0D7CC14F1A5D1070A35DFF7862; SUID=1A4FC17C2113940A000000005D1070A3; weixinIndexVisited=1; pgv_pvi=2997798912; ld=tZllllllll2N9tTHlllllV1S8k7lllllKGV5pklllltlllllpylll5@@@@@@@@@@; LSTMV=682%2C416; LCLKINT=8742; SNUID=F7A22C91EDE961A343DEFB22EDD4F97E; sct=14; JSESSIONID=aaaoA1hhCs7Mf8ji7bsTw",
    "Host": "weixin.sogou.com",
    "Pragma": "no-cache",
    "Referer": "https://weixin.sogou.com/weixin?usip=&query=%E9%98%BF%E5%B0%94%E6%B3%95%E7%8B%97&ft=&tsn=1&et=&interation=&type=2&wxid=&page=2&ie=utf8",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}


def detecting_keywords():
    key_list = get_keyword(yq)
    if len(key_list) == 0:
        return []
    all_key_list = sorted(key_list, key=lambda x: x[0], reverse=False)
    data_list = []
    for data in all_key_list:
        keyword = data[1]
        pid = data[2]
        page = data[4]
        if '"wx_data"' not in page:
            update_flag(keyword, pid, yq)
            continue
        data_list.append([keyword, pid])
    return data_list


def random_sleep():
    """
    Pause for a random interval to avoid getting banned.
    :return:
    """
    a = random.randint(2, 8)
    logger.info("Sleeping for {} seconds".format(a))
    time.sleep(a)


def get_cookies():
    """
    Fetch fresh cookies (mainly the SNUID) by requesting Sogou video search, then cache them to a file.
    :return:
    """
    logger.info("Fetching WeChat SNUID cookies")
    url = 'https://v.sogou.com/v?ie=utf8&query=&p=40030600'
    headers = {'User-Agent': ua}
    proxies = {
        "http": "http://" + get_proxy()
    }
    rst = requests.get(url=url, headers=headers, allow_redirects=False, proxies=proxies)
    cookies = rst.cookies.get_dict()
    with open("wx_cookie.json", "w+") as f:
        f.write(json.dumps(cookies))
    logger.info("wx_cookies fetched successfully")


def timestamp(dt):
    """
    Convert a "%Y-%m-%d %H:%M:%S" datetime string to a Unix timestamp.
    :param dt:
    :return:
    """
    timeArray = time.strptime(dt, "%Y-%m-%d %H:%M:%S")
    timestamp = time.mktime(timeArray)
    return timestamp


def yesterday():
    """
    Return the timestamp for this time yesterday.
    :return:
    """
    timestamp = int(time.time())
    yesterday = timestamp - 24 * 60 * 60
    return yesterday


def get_req():
    """
    Load the cached cookies, splice the SNUID into the cookie string, and build a requests session.
    :return:
    """
    logger.info("打开cookie文件获取cookies")
    with open("wx_cookie.json", "r")as f:
        cookies = json.loads(f.readline())
    snuid = cookies["SNUID"]
    cookie = "ABTEST=2|1561358499|v1; IPLOC=CN1100; SUID=1A4FC17C4631990A000000005D1070A3; SUV=00CDEE0D7CC14F1A5D1070A35DFF7862; SUID=1A4FC17C2113940A000000005D1070A3; weixinIndexVisited=1; pgv_pvi=2997798912; ld=tZllllllll2N9tTHlllllV1S8k7lllllKGV5pklllltlllllpylll5@@@@@@@@@@; LSTMV=682%2C416; LCLKINT=8742; SNUID={}; sct=14; JSESSIONID=aaaoA1hhCs7Mf8ji7bsTw".format(
        snuid)
    header["Cookie"] = cookie
    header["User-Agent"] = ua
    req = requests.Session()
    req.headers = header
    return req


def cookies_expired():
    """
    Check whether the cached cookies are still valid; if they have expired, fetch new ones automatically.
    :return:
    """
    file = os.path.isfile("wx_cookie.json")
    if not file:
        get_cookies()
    req = get_req()
    url = "https://weixin.sogou.com/weixin?usip=&query=%E9%98%BF%E5%B0%94%E6%B3%95%E7%8B%97&ft=&tsn=1&et=&interation=&type=2&wxid=&page=3&ie=utf8"
    response = req.get(url, allow_redirects=False)
    logger.info("验证cookies是否可用")
    login_state = True
    if response.status_code == 302:
        login_state = False
    if login_state:
        logger.info("cookies可用")
        return req
    else:
        logger.info("cookies不可用,重新获取cookies")
        get_cookies()
        req = get_req()
        return req


def into_database(items):
    pipeline = DataPipeline()
    pipeline.process_item(items=items)


def list_dict_duplicate_removal(data_list):
    # Deduplicate a list of dicts while preserving their order
    run_function = lambda x, y: x if y in x else x + [y]
    return reduce(run_function, [[], ] + data_list)


def main():
    while True:
        keywords = detecting_keywords()
        if not keywords:
            logger.info("Nothing to collect this round, sleeping for 1 minute")
            time.sleep(60)
            continue
        for key in keywords:
            keyword = key[0]
            pid = key[1]
            logger.info("当前关键词是:{}".format(keyword))
            start_url = "https://weixin.sogou.com/weixin?usip=&query={}&ft=&tsn=1&et=&interation=&type=2&wxid=&page={}&ie=utf8"
            req = cookies_expired()
            for i in range(1, 11):
                random_sleep()
                proxies = {
                    "http": "http://" + get_proxy()
                }
                response = req.get(url=start_url.format(keyword, i), allow_redirects=False, proxies=proxies)
                if response.status_code == 302:
                    logger.warning("Hit the CAPTCHA page, the IP needs to be switched")
                rex = etree.HTML(response.text)
                infos = rex.xpath('//ul[@class="news-list"]/li')  # list of result items on this page
                logger.info("Items on this page: {}".format(len(infos)))
                if len(infos) == 0:
                    update_flag(keyword, pid, yq)
                    break
                rank = 0
                save_data = []
                for data in infos:
                    rank += 1
                    ranks = (int(i) - 1) * 10 + rank
                    link = data.xpath('string(./div[@class="txt-box"]/h3/a/@data-share)')
                    _title = data.xpath('string(./div[@class="txt-box"]/h3/a)')
                    summary = data.xpath('string(./div[@class="txt-box"]/p)')
                    screen_name = data.xpath('string(./div[@class="txt-box"]/div[@class="s-p"]/a)')
                    timestamps = data.xpath('string(./div[@class="txt-box"]/div[@class="s-p"]/@t)')
                    tup = (pid, keyword, timestamps, _title, 1, yq)
                    serialized = phpserialize.dumps(tup)  # avoid reusing the loop variable name `data`
                    guid = hash(serialized)
                    item = dict()
                    item['mid'] = 0
                    item['guid'] = guid
                    item['short_url'] = link
                    item['rank_old'] = 0
                    item['yq'] = yq
                    item['nr'] = 1
                    item['sh'] = 0
                    item['qr'] = 0
                    item['cjtype'] = 1
                    item['update_time'] = int(time.time())
                    item['del_time'] = 0
                    item['status'] = 1
                    item['isnew_email'] = 0
                    item['ftype'] = 1
                    item['isnew_wx'] = 0
                    item['froms'] = 0
                    item['zf'] = 1
                    item['title'] = _title
                    item['url'] = link
                    item['screen_name'] = screen_name
                    item['content'] = summary
                    item['screen_time'] = timestamps
                    item['dzs'] = 0
                    item['zfs'] = 0
                    item['pls'] = 0
                    item['keyword'] = keyword
                    item['rank'] = ranks
                    item['page'] = i
                    item['pid'] = pid
                    item['screen_time'] = timestamps
                    item['is_white'] = 1
                    save_data.append(item)
                save_datas = list_dict_duplicate_removal(save_data)
                into_database(save_datas)
                next_page = rex.xpath('//a[@id="sogou_next"]/@href')
                if len(next_page) == 0:
                    logger.info("Done, no next page, currently on page {}".format(i + 1))
                    update_flag(keyword, pid, yq)
                    break
                if i >= 10:
                    # range(1, 11) never goes above 10, so check >= 10 to flag the keyword after the last page
                    logger.info("Done, currently on page {}".format(i + 1))
                    update_flag(keyword, pid, yq)
                    break
                logger.info("Turning the page, now on page {}".format(i + 1))


if __name__ == '__main__':
    main()

 
