Crawling Sogou WeChat search results by keyword with Python

# -*- coding: utf-8 -*-
import random
import requests
from pyquery import PyQuery as pq
from urllib.parse import urlencode, quote
import uuid
import time
import re
import hashlib
import MySQLdb  # needed by crawl_baidu() for the crawl_result table writes
from utils.img_to_tencent import img_to_tencent

def md5(text):
    '''Return the hex MD5 digest of a string; used to de-duplicate article URLs.'''
    return hashlib.md5(text.encode('utf-8')).hexdigest()


PC_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:26.0) Gecko/20100101 Firefox/26.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36'
]

headers = {
        'User-Agent': random.sample(PC_UAS, 1)[0],
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Referer': 'https://www.baidu.com/',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'close',
    }

def get_k_h(url):
    '''Append the anti-crawl "k" and "h" query parameters that Sogou expects on a result link.'''
    b = int(random.random() * 100) + 1
    a = url.find("url=")
    url = url + "&k=" + str(b) + "&h=" + url[a + 4 + 21 + b: a + 4 + 21 + b + 1]
    return url
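
# Minimal usage sketch (the link below is a made-up placeholder, not a real Sogou result link):
# get_k_h() draws a random k in 1..100 and copies the single character found k positions past
# the start of the encrypted "url=" payload (offset len("url=") + 21) into h, which the
# /link endpoint apparently checks before redirecting.
#
#   fake_link = "/link?url=" + "x" * 150   # hypothetical payload, long enough to index into
#   print(get_k_h(fake_link))              # -> "/link?url=xxx...&k=<k>&h=x"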


a_str = '''
uigs_cl	first_click
uigs_refer	https://weixin.sogou.com/
uigs_productid	vs_web
terminal	web
vstype	weixin
pagetype	result
channel	result_article
s_from	input
sourceid	
type	weixin_search_pc
uigs_cookie	SUID,sct
weixintype	2
exp_status	-1
exp_id_list	0_0
wuid	0071440178DB40975D3C689EE37C6784
rn	1
login	0
uphint	1
bottomhint	1
page	1
exp_id	null_0-null_1-null_2-null_3-null_4-null_5-null_6-null_7-null_8-null_9
time	20914
'''


def str_to_dict(a_str):
    '''
    Convert the tab-separated key/value text in a_str into a dict.
    :param a_str: multi-line string with one "key<TAB>value" pair per line
    :return: dict of Sogou tracking parameters
    '''
    str_a = list(i for i in a_str.split('\n') if i != '')
    str_b = {}
    for a in str_a:
        a1 = a.split('\t')[0]
        a2 = a.split('\t')[1]
        str_b[a1] = a2

    return str_b


b_data = str_to_dict(a_str)
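
# A sketch of what b_data now contains (values copied from a_str above); crawl_baidu()
# later adds 'query', and get_suva() adds 'snuid', 'uuid' and 'uigs_t' before the dict is
# urlencoded into the pb.sogou.com/pv.gif tracking request:
#   b_data['uigs_cl']  -> 'first_click'
#   b_data['page']     -> '1'
#   b_data['wuid']     -> '0071440178DB40975D3C689EE37C6784'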


def get_suva(sunid):
    '''
    Fetch the SUV value that matches the given SNUID from Sogou's pv.gif tracking
    endpoint and store it in the shared request headers' Cookie.
    :param sunid: the "SNUID=..." fragment taken from the search page's Set-Cookie header
    :return:
    '''
    b_data['snuid'] = sunid.split('=')[-1]
    b_data['uuid'] = uuid.uuid1()
    b_data['uigs_t'] = str(int(round(time.time() * 1000)))
    url_link = 'https://pb.sogou.com/pv.gif?' + urlencode(b_data)
    res = requests.get(url_link)
    cookie_s = res.headers['Set-Cookie'].split(',')
    cookie_list_s = []
    for i in cookie_s:
        for j in i.split(','):
            if 'SUV' in j:
                cookie_list_s.append(j)
            else:
                continue
    # print(cookie_list_s[0].split(';')[0])
    headers['Cookie'] = cookie_list_s[0].split(';')[0]
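
# Rough shape of the cookie handshake this scraper relies on (as observed in the code,
# not documented by Sogou):
#   1. the search results page sets SUID/SNUID via Set-Cookie;
#   2. get_suva() replays the SNUID against pb.sogou.com/pv.gif, which answers with an SUV cookie;
#   3. crawl_baidu() then sends SUID + SNUID + SUV together when following the "/link?url=..."
#      redirect, which is what allows the real mp.weixin.qq.com article URL to be resolved.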



def crawl_baidu(word):
    b_data.update({'query':word})
    pc_headers = {
        'User-Agent': random.sample(PC_UAS, 1)[0],
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Referer': 'https://weixin.sogou.com/',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Host': 'weixin.sogou.com',
        "Accept - Language": "zh - CN, zh;q = 0.9",

    }
    dat = []
    for i in range(1,3):
        html_text = ''
        resp = ''
        for j in range(3):
            url = 'https://weixin.sogou.com/weixin?type=2&s_from=input&query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%s&ie=utf8' % (
                word, int(i))
            try:
                proxies = {'http': 'h'}  # placeholder proxy mapping from the original post; swap in a real proxy or drop the proxies= argument
                resp = requests.get(url,headers=pc_headers,timeout=25,proxies=proxies)
                html_text = resp.content
            except Exception as e:
                print(e)
                continue
            break
        if html_text:
            cookies = resp.headers['Set-Cookie'].split(';')
            cookie_list_long = []
            cookie_list2 = []
            doc = pq(html_text)
            divs = doc('.txt-box').items()
            for j in divs:
                url_list11 = pq(html_text)('.news-list li').items()
                img_list = []
                for i in url_list11:
                    # extract the thumbnail image src from each result item
                    try:
                        url_list12 = pq(i('.img-box img').attr('src'))
                        if not url_list12:
                            data_imgs = ''
                        else:
                            url_list12 = str(url_list12).replace('<p>', '').replace('</p>', '').replace('amp;', '')
                            data_imgs = str(url_list12).replace('//img01.sogoucdn.com/net/a/04/link?appid=100520033&url=', '')
                            img_list.append(data_imgs)
                    except:
                        pass

                data_title = j('h3 a').text()
                data_content = j('.txt-info').text()
                show_info = j('.s-p').text()
                show_info = str(show_info).replace("document.write(timeConvert('", ',').replace("'))", '')
                author_name = show_info.split(',')[0]
                data_showtime = show_info.split(',')[1]

                for cookie in cookies:
                    cookie_list_long.append(str(cookie).split(','))
                for i in cookie_list_long:
                    for se in i:
                        if 'SUID' in se or 'SNUID' in se:
                            cookie_list2.append(se)
                sunid = cookie_list2[0].split(';')[0]
                get_suva(sunid)
                # build the dynamic Cookie header from SUID/SNUID plus the freshly fetched SUV
                headers['Cookie'] = headers['Cookie'] + ';' + ';'.join(cookie_list2)

                target_url = j('h3 a').attr('href')
                b = int(random.random() * 100) + 1
                a = target_url.find("url=")
                result_link = target_url + "&k=" + str(b) + "&h=" + target_url[a + 4 + 21 + b: a + 4 + 21 + b + 1]
                a_url = "https://weixin.sogou.com" + result_link

                second_url = ''
                for i in range(3):
                    try:
                        second_url = requests.get(a_url, headers=headers, proxies=proxies, timeout=20).text
                    except:
                        continue
                    break

                # extract the real article URL from the JS redirect page
                url_text = re.findall(r"\'(\S+?)\';", second_url, re.S)
                best_url = ''.join(url_text)
                best_url = str(best_url).replace('&from=inner', '')

                author_imgs = ''
                try:
                    data_imgs = img_list[len(dat)]
                except:
                    data_imgs = ''
                # img_to_tencent(str(data_imgs))

                if 'http://mp' in best_url:
                    dat.append([word])
                    mysql_config = {"host": "", "port": 3306, 'user': "root", "passwd": "16", "db": "wn", "charset": "utf8"}
                    conn = MySQLdb.connect(**mysql_config)
                    cursor = conn.cursor()
                    target_url_md5 = md5(best_url)
                    cursor.execute("select data_title from crawl_result where data_title=%s", (data_title,))
                    titles = cursor.fetchone()
                    if titles:
                        pass
                    else:
                        print(best_url, data_title, data_imgs, data_content, data_showtime, author_name, author_imgs, word)
                        cursor.execute("select source_keywords from crawl_result where target_url_md5=%s", (target_url_md5,))
                        data = cursor.fetchone()
                        if data:
                            source_keywords = data[0]
                            if word not in source_keywords.strip().split(","):
                                source_keywords += ",%s" % word
                                source_keywords = ','.join(list(set(source_keywords.split(","))))
                                cursor.execute("update crawl_result set source_keywords=%s where target_url_md5=%s",
                                               (source_keywords, target_url_md5))
                                conn.commit()
                                print('ok1111')
                        else:
                            if data_content:
                                cursor = conn.cursor()
                                cursor.execute(
                                    "insert into crawl_result(target_url,target_url_md5,addtime,data_title,data_imgs,data_content,data_showtime,data_json,source,source_keywords,state,author_name,author_imgs,author_id,author_json) "
                                    "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                                    (best_url, target_url_md5, int(time.time()), data_title, data_imgs, data_content,
                                     data_showtime, '', 4, word, 0, author_name, author_imgs, '', ''))
                                conn.commit()
                                print('ok')


if __name__ == '__main__':
    from multiprocessing.dummy import Pool

    kws_list = ['破碎机']
    # The original post also extended kws_list from a `kws` iterable of (keyword,) rows
    # (presumably fetched from the database), which is not defined in this snippet:
    # for keyword, in kws:
    #     kws_list.append(keyword)
    pool = Pool(20)
    pool.map(crawl_baidu, kws_list)
    # The trailing cursor.close()/conn.close() calls from the original post are dropped here:
    # cursor and conn only exist inside crawl_baidu().
