# -*- coding: utf-8 -*-
import random
import requests
from pyquery import PyQuery as pq
from urllib.parse import urlencode, quote
import uuid
import time
import re
import hashlib
import MySQLdb
from utils.img_to_tencent import img_to_tencent
def md5(s):
    # Hex digest of the UTF-8 encoded string; used below to build target_url_md5 for de-duplication.
    return hashlib.md5(s.encode('utf-8')).hexdigest()
PC_UAS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:26.0) Gecko/20100101 Firefox/26.0',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:26.0) Gecko/20100101 Firefox/26.0',
'Mozilla/5.0 (Windows NT 6.1; rv:26.0) Gecko/20100101 Firefox/26.0',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36'
]
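# Desktop User-Agent pool; one entry is drawn at random per headers dict to vary the browser fingerprint.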
headers = {
    'User-Agent': random.choice(PC_UAS),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Referer': 'https://www.baidu.com/',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'Connection': 'close',
}
def get_k_h(url):
    # Append the anti-crawl "k" and "h" query parameters Sogou expects on its result links:
    # k is a random integer in [1, 100] and h is the single character found k positions past
    # a fixed offset behind the "url=" parameter.
    b = int(random.random() * 100) + 1
    a = url.find("url=")
    url = url + "&k=" + str(b) + "&h=" + url[a + 4 + 21 + b: a + 4 + 21 + b + 1]
    return url
a_str = '''
uigs_cl first_click
uigs_refer https://weixin.sogou.com/
uigs_productid vs_web
terminal web
vstype weixin
pagetype result
channel result_article
s_from input
sourceid
type weixin_search_pc
uigs_cookie SUID,sct
weixintype 2
exp_status -1
exp_id_list 0_0
wuid 0071440178DB40975D3C689EE37C6784
rn 1
login 0
uphint 1
bottomhint 1
page 1
exp_id null_0-null_1-null_2-null_3-null_4-null_5-null_6-null_7-null_8-null_9
time 20914
'''
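# a_str lists the static fields (one tab-separated key/value pair per non-empty line) of the
# payload sent to Sogou's pv.gif tracking pixel in get_suva() below.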
def str_to_dict(a_str):
    '''
    Convert the tab-separated key/value lines of a_str into a dict.
    :param a_str: multi-line string with one "key<TAB>value" pair per non-empty line
    :return: dict mapping each key to its (possibly empty) value
    '''
    str_b = {}
    for line in (i for i in a_str.split('\n') if i != ''):
        parts = line.split('\t')
        str_b[parts[0]] = parts[1] if len(parts) > 1 else ''
    return str_b
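# Example: the "page\t1" line in a_str above becomes the entry {'page': '1'} in b_data.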
b_data = str_to_dict(a_str)
def get_suva(sunid):
    '''
    Hit Sogou's pv.gif tracking pixel with the given SNUID to obtain an SUV cookie,
    and store that cookie in the module-level headers dict.
    :param sunid: "SNUID=..." fragment taken from the search result's Set-Cookie header
    :return:
    '''
    b_data['snuid'] = sunid.split('=')[-1]
    b_data['uuid'] = str(uuid.uuid1())
    b_data['uigs_t'] = str(int(round(time.time() * 1000)))
    url_link = 'https://pb.sogou.com/pv.gif?' + urlencode(b_data)
    res = requests.get(url_link)
    # Pick the SUV cookie out of the combined Set-Cookie header.
    cookie_list_s = [c for c in res.headers['Set-Cookie'].split(',') if 'SUV' in c]
    headers['Cookie'] = cookie_list_s[0].split(';')[0]
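# After get_suva() runs, headers['Cookie'] holds the SUV cookie; crawl_baidu() appends the
# SUID/SNUID cookies to it before requesting the article redirect links.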
def crawl_baidu(word):
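    # Despite the name, this crawls Sogou WeChat search (weixin.sogou.com) for the given keyword
    # and stores matching article links in MySQL.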
b_data.update({'query':word})
pc_headers = {
        'User-Agent': random.choice(PC_UAS),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Cache-Control': 'no-cache',
'Pragma': 'no-cache',
'Referer': 'https://weixin.sogou.com/',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'Host': 'weixin.sogou.com',
"Accept - Language": "zh - CN, zh;q = 0.9",
}
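    # pc_headers is used for the search result pages; the module-level headers dict (which carries
    # the SUV/SUID/SNUID cookies set up below) is used for the article redirect links.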
dat = []
for i in range(1,3):
html_text = ''
resp = ''
for j in range(3):
url = 'https://weixin.sogou.com/weixin?type=2&s_from=input&query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%s&ie=utf8' % (
word,int(i))
try:
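                # The proxy mapping below looks like a redacted placeholder; supply a real proxy
                # URL (or drop the proxies argument) before running.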
proxies = {'http': 'h'}
resp = requests.get(url,headers=pc_headers,timeout=25,proxies=proxies)
html_text = resp.content
except Exception as e:
print(e)
continue
break
if html_text:
cookies = resp.headers['Set-Cookie'].split(';')
cookie_list_long = []
cookie_list2 = []
doc = pq(html_text)
divs = doc('.txt-box').items()
for j in divs:
url_list11 = pq(html_text)('.news-list li').items()
img_list = []
for i in url_list11:
                    # Extract the thumbnail image src, if the result has one.
                    try:
                        img_src = i('.img-box img').attr('src')
                        if not img_src:
                            data_imgs = ''
                        else:
                            img_src = str(img_src).replace('\n', '').replace('amp;', '')
                            data_imgs = img_src.replace('//img01.sogoucdn.com/net/a/04/link?appid=100520033&url=', '')
img_list.append(data_imgs)
except:
pass
data_title = j('h3 a').text()
data_content = j('.txt-info').text()
show_info = j('.s-p').text()
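                # show_info looks like "<author> document.write(timeConvert('<timestamp>'))";
                # the replaces below turn it into "<author>,<timestamp>".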
show_info = str(show_info).replace("document.write(timeConvert('",',').replace("'))",'')
author_name = show_info.split(',')[0]
data_showtime = show_info.split(',')[1]
for cookie in cookies:
cookie_list_long.append(str(cookie).split(','))
for i in cookie_list_long:
for se in i:
if 'SUID' in se or 'SNUID' in se:
cookie_list2.append(se)
sunid = cookie_list2[0].split(';')[0]
get_suva(sunid)
# 构造动态Cookies
headers['Cookie'] = headers['Cookie'] + ';' + ';'.join(cookie_list2)
target_url = j('h3 a').attr('href')
                # Reuse get_k_h() to append the anti-crawl "k"/"h" parameters to the result link.
                result_link = get_k_h(target_url)
a_url = "https://weixin.sogou.com" + result_link
second_url = ''
for i in range(3):
try:
second_url = requests.get(a_url, headers=headers,proxies=proxies,timeout=20).text
except:
continue
break
                # Piece the real article URL together from the quoted JS fragments on the anti-crawl redirect page.
                url_text = re.findall(r"'(\S+?)';", second_url, re.S)
best_url = ''.join(url_text)
best_url = str(best_url).replace('&from=inner', '')
author_imgs = ''
try:
data_imgs = img_list[len(dat)]
except:
data_imgs = ''
# img_to_tencent(str(data_imgs))
if 'http://mp' in best_url:
dat.append([word])
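                # The MySQL settings below appear to be redacted placeholders; note that a new
                # connection is opened for each stored result.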
mysql_config = {"host": "",
"port": 3306,
'user': "root",
"passwd": "16",
"db": "wn",
"charset": "utf8"}
conn = MySQLdb.connect(**mysql_config)
cursor = conn.cursor()
target_url_md5 = md5(best_url)
cursor.execute("select data_title from crawl_result where data_title=%s",(data_title,))
titles = cursor.fetchone()
if titles:
pass
else:
print(best_url, data_title, data_imgs, data_content, data_showtime,author_name, author_imgs,word)
cursor.execute("select source_keywords from crawl_result where target_url_md5=%s", (target_url_md5,))
data = cursor.fetchone()
if data:
source_keywords = data[0]
if word not in source_keywords.strip().split(","):
source_keywords += ",%s" % word
source_keywords = ','.join(list(set(source_keywords.split(","))))
cursor.execute("update crawl_result set source_keywords=%s where target_url_md5=%s",
(source_keywords,target_url_md5))
conn.commit()
print('ok1111')
else:
if data_content:
cursor = conn.cursor()
cursor.execute(
"insert into crawl_result(target_url,target_url_md5,addtime,data_title,data_imgs,data_content,data_showtime,data_json,source,source_keywords,state,author_name,author_imgs,author_id,author_json) "
"values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
(best_url, target_url_md5, int(time.time()), data_title, data_imgs, data_content, data_showtime,
'', 4, word, 0, author_name, author_imgs, '',''))
conn.commit()
print('ok')
if __name__ == '__main__':
    from multiprocessing.dummy import Pool
    kws_list = ['破碎机']
    # Additional keywords (e.g. loaded from a database) could be appended to kws_list here.
    pool = Pool(20)
    pool.map(crawl_baidu, kws_list)
    pool.close()
    pool.join()