# coding:utf-8
import re
import random
import requests
import time
import hashlib
import json
import MySQLdb
from datetime import datetime, timedelta
# django.utils.http.urlquote was a thin wrapper around urllib.parse.quote and
# was removed in Django 4.0; the stdlib version behaves the same here.
from urllib.parse import quote as urlquote
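# Connection settings below are masked placeholders ("*****"); fill in the
# real host and password (and adjust the port if needed) before running.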
mysql_config = {"host": "*****8",
                "port": 3306,  # standard MySQL port; adjust to your server
                "user": "root",
                "passwd": "***8888",
                "db": "won",
                "charset": "utf8"}
# User agents for the mobile site (m.weibo.cn); every entry is a mobile UA,
# picked at random per request.
MOBILE_UAS = [
    'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10',
    'Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0',
    'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    'Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+',
    'Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999',
    'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    'Mozilla/5.0 (Linux; Android 7.1.1; MI 6 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/6.2 TBS/043807 Mobile Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/WIFI Language/zh_CN',
    'Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/4G Language/zh_CN',
    'Mozilla/5.0 (Linux; Android 6.0.1; SM919 Build/MXB48T; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/WIFI Language/zh_CN',
    'Mozilla/5.0 (Linux; Android 5.1.1; vivo X6S A Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.6.1.1220(0x26060135) NetType/WIFI Language/zh_CN'
]
TRY_COUNT = 5  # max request retries per page
# Captured once at import time; month-day timestamps assume this year.
current_year = datetime.today().strftime("%Y")
def conv_time(t):
    """Convert a Weibo display time, either relative ("刚刚", "5分钟前",
    "3小时前") or absolute ("07-21", "2019-07-21"), to a unix timestamp.
    Returns 0 for anything unparseable."""
    if "刚刚" in t:  # "just now"
        return int(time.time())
    nums = re.findall(r'\d+', t)
    if not nums:
        return 0
    num = int(nums[0])
    if u'秒' in t:      # N seconds ago
        s = datetime.now() - timedelta(seconds=num)
    elif u'分钟' in t:  # N minutes ago
        s = datetime.now() - timedelta(minutes=num)
    elif u'小时' in t:  # N hours ago
        s = datetime.now() - timedelta(hours=num)
    elif u'天' in t:    # N days ago
        s = datetime.now() - timedelta(days=num)
    else:
        parts = len(t.split("-"))
        if parts == 3:    # full date: 2019-07-21
            s = datetime.strptime(t, "%Y-%m-%d")
        elif parts == 2:  # month-day only: assume the current year
            s = datetime.strptime(t + ", " + current_year, "%m-%d, %Y")
        else:
            return 0
    try:
        return int(time.mktime(s.timetuple()))
    except (ValueError, OverflowError):
        return 0
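# For example, conv_time("5分钟前") yields a timestamp roughly 300 seconds in
# the past, conv_time("07-21") resolves against the current year, and any
# string the function cannot parse falls through to 0.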
def parse_mblog(mblog):
    """Flatten one mblog dict from the search API into the field tuple
    stored in crawl_result."""
    data_showtime = mblog.get("created_at")
    data_showtime = conv_time(data_showtime) if data_showtime else 0
    b_text = mblog.get("text")
    if b_text:
        title = re.sub("<.*?>", "", b_text)  # strip inline HTML tags
        imgs_num = int(mblog.get("pic_num", 0))
        if imgs_num == 1:
            data_imgs = mblog.get("original_pic", "")
        else:
            # Multiple image URLs are stored joined by a "|||" separator.
            pics = [i.get("url") for i in mblog.get("pics", [])]
            data_imgs = "|||".join(pics) if pics else ""
        bid = mblog.get("id", "")
        bid_str = mblog.get("bid", "")
    else:
        title = data_imgs = bid = bid_str = ""
    # The user object may be missing entirely; default every field to "".
    user = mblog.get("user") or {}
    author_name = user.get("screen_name", "")
    author_imgs = user.get("profile_image_url", "")
    author_id = user.get("id", "")
    author_gender = user.get("gender", "")  # the user object's field is "gender"
    author_description = user.get("description", "")
    data_json = json.dumps({"bid_num": bid, "text": b_text, "bid_str": bid_str})
    author_json = json.dumps({"gender": author_gender,
                              "description": author_description})
    return (title, data_imgs, data_showtime, author_id, author_imgs,
            author_name, data_json, author_json)
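# A rough sketch of the mblog shape parse_mblog expects, based on the fields
# read above (values here are illustrative, not real data):
#   {"created_at": "5分钟前", "text": "...<a>tag</a>...", "pic_num": 2,
#    "pics": [{"url": "https://..."}, {"url": "https://..."}],
#    "id": "...", "bid": "...",
#    "user": {"screen_name": "...", "profile_image_url": "...", "id": 123,
#             "gender": "m", "description": "..."}}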
def dig_weibo(keyword, page):
    """
    Crawl one page of Weibo search results for `keyword` and store them in
    the crawl_result table. Returns True when anything was collected.
    """
    conn = MySQLdb.connect(**mysql_config)
    cursor = conn.cursor()
    proxies = {'http': 'http://**********', 'https': 'http://***********'}
    mobile_headers = {
        'User-Agent': random.choice(MOBILE_UAS),
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'Referer': 'https://m.weibo.cn',
        'Connection': 'close',
        'Host': 'm.weibo.cn'
    }
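    # The type/q pair is percent-encoded into the containerid value, which is
    # how this endpoint addresses its search container. An assembled URL looks
    # roughly like this (keyword "foo", illustrative only):
    #   https://m.weibo.cn/api/container/getIndex?containerid=100103%26type%3D1%26q%3Dfoo&page_type=searchall&page=1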
    url = "https://m.weibo.cn/api/container/getIndex?containerid=100103"
    count = 0
    insert_infos = []
    resp_data = {}
    req_url = (url + urlquote("&type=1&q={}".format(keyword))
               + "&page_type=searchall&page={}".format(page))
    print(req_url)
    while count < TRY_COUNT:
        try:
            resp_data = requests.get(req_url, headers=mobile_headers,
                                     proxies=proxies, timeout=10).json()
        except Exception as e:
            time.sleep(0.1)
            print(e)
            count += 1
            continue
        else:
            break
    if resp_data.get("ok") != 1:  # ok != 1 signals an empty or blocked page
        cursor.close()
        conn.close()
        return False
    cards = (resp_data.get("data") or {}).get("cards")
    if not cards:
        cursor.close()
        conn.close()
        return False
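    # Cards arrive in two layouts: a top-level card may carry an mblog
    # directly, or wrap a card_group whose members carry either mblogs or
    # "hot article" entries keyed by title_sub. Abridged sketch (field values
    # are illustrative):
    #   {"ok": 1, "data": {"cards": [
    #       {"mblog": {...}, "scheme": "https://..."},
    #       {"card_group": [{"mblog": {...}},
    #                       {"title_sub": "...", "scheme": "..."}]}]}}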
    mblogs = []
    hot_wenzhang = []  # "hot article" cards that carry title_sub instead of an mblog
    for card in cards:
        mblog = card.get("mblog")
        if mblog:
            mblogs.append([card, mblog])
            continue
        card_group = card.get("card_group")
        if not card_group:
            continue
        for gcard in card_group:
            mblog = gcard.get("mblog")
            if mblog:
                mblogs.append([gcard, mblog])
            elif gcard.get("title_sub"):
                hot_wenzhang.append(gcard)
    addtime = int(time.time())
    for card in hot_wenzhang:
        title = card.get("title_sub")
        target_url = (card.get("scheme") or "").split("?", 1)[0]
        target_url_md5 = hashlib.md5(target_url.encode("utf8")).hexdigest() if target_url else ""
        data_imgs = card.get("pic", "")
        try:
            # desc looks like "<author name> <display time>"
            author_name, wb_time = card.get("desc").split(" ")
            data_showtime = conv_time(wb_time)
        except (AttributeError, ValueError):
            data_showtime = 0
            author_name = ""
        info = [target_url, target_url_md5, addtime, title, data_imgs,
                title, data_showtime, "{}", 2, keyword, 0, author_name, "", "", "{}"]
        insert_infos.append(info)
    for card, mblog in mblogs:
        (title, data_imgs, data_showtime, author_id, author_imgs,
         author_name, data_json, author_json) = parse_mblog(mblog)
        target_url = (card.get("scheme") or "").split("?", 1)[0]
        target_url_md5 = hashlib.md5(target_url.encode("utf8")).hexdigest() if target_url else ""
        info = [target_url, target_url_md5, addtime, title, data_imgs,
                title, data_showtime, data_json, 2, keyword, 0, author_name,
                author_imgs, author_id, author_json]
        insert_infos.append(info)
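    # Each info row lines up positionally with the crawl_result columns in the
    # insert below: (target_url, target_url_md5, addtime, data_title, data_imgs,
    # data_content, data_showtime, data_json, source, source_keywords, state,
    # author_name, author_imgs, author_id, author_json). source is hard-coded
    # to 2 and state to 0 for every row produced here.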
    for info in insert_infos:
        # Skip rows with no title, and sinanews:// deep links.
        if info[3] is None or info[0].startswith("sinanews"):
            continue
        target_url_md5 = info[1]
        cursor.execute("select source_keywords from crawl_result where target_url_md5=%s",
                       (target_url_md5,))
        kw_info = cursor.fetchone()
        if kw_info:
            # Already crawled: merge the new keyword into source_keywords.
            kw_list = kw_info[0].split(",")
            kw_list.append(info[9])
            kws = ",".join(set(kw_list))
            cursor.execute("update crawl_result set source_keywords=%s where target_url_md5=%s",
                           (kws, target_url_md5))
            conn.commit()
        else:
            try:
                cursor.execute("""insert into crawl_result(target_url,target_url_md5,addtime,data_title,
                        data_imgs,data_content,data_showtime,data_json,source,source_keywords,
                        state,author_name,author_imgs,author_id,author_json)
                        values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                        on duplicate key update target_url_md5=values(target_url_md5)""", info)
                conn.commit()
            except Exception as e:
                print(e)
    cursor.close()
    conn.close()
    return bool(insert_infos)
def get_kws():
    """Fetch every search keyword, ordered by keyword id."""
    conn = MySQLdb.connect(**mysql_config)
    cursor = conn.cursor()
    cursor.execute("select keyword from whool_opinion.keyword order by kid asc")
    kw = cursor.fetchall()
    cursor.close()
    conn.close()
    return kw
if __name__ == "__main__":
    for keyword, in get_kws():
        # Walk pages 1..99 and stop as soon as a page yields nothing.
        for page in range(1, 100):
            if not dig_weibo(keyword, str(page)):
                break
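# A minimal parallel variant (sketch only): fan the keywords out across worker
# processes with multiprocessing.Pool. The pool size of 4 and the
# crawl_keyword helper are assumptions, not part of the original flow.
#
#     import multiprocessing
#
#     def crawl_keyword(keyword):
#         for page in range(1, 100):
#             if not dig_weibo(keyword, str(page)):
#                 break
#
#     if __name__ == "__main__":
#         with multiprocessing.Pool(4) as pool:
#             pool.map(crawl_keyword, [k for k, in get_kws()])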