# This code is for learning and exchange only; do not use it for illegal purposes.
# -*- coding:utf-8 -*-
import requests
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from bs4 import BeautifulSoup
import os
import time
from queue import Queue
import threading
import hashlib
import MySQLdb
'''
@Author :wanglei
@Date   :2019/10/10
@Desc   :Sogou WeChat news crawler
'''
#---------------------------------------------------------------------------
threadNum = 1
mysql_user = "root"
mysql_password = "root"
mysql_database = "news"
mysql_table = "wx_sou_news"
monitorPath = r"c:/users/asus/PycharmProjects/it002/crawler/wxSou/category/"
#---------------------------------------------------------------------------
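# A minimal sketch of the wx_sou_news MySQL table this script writes to. The
# column list is taken from the INSERT statement in pipLine below; the types
# and sizes are assumptions, not a schema confirmed by the author:
#
#   CREATE TABLE wx_sou_news (
#       id          INT AUTO_INCREMENT PRIMARY KEY,
#       category    VARCHAR(32),
#       banner      VARCHAR(512),
#       title       VARCHAR(512),
#       url         VARCHAR(1024),
#       description TEXT,
#       source      VARCHAR(128),
#       `date`      VARCHAR(32),   -- relative age string, e.g. "5分钟前"
#       hs          CHAR(32)       -- md5 of the banner URL, used for dedup
#   ) DEFAULT CHARSET=utf8mb4;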
headers = {
"Cookie": "SUV=005A28DAABD72B4D5D1D3A358EDDF616; CXID=2B6C8A624E9C1A5A5FA5AAEA5CE40242; SUID=4D2BD7AB3220910A000000005D1CCB39; YYID=18AFB43C3A5F3B3AA6478C7D6E3167A1; pgv_pvi=6812688384; weixinIndexVisited=1; sct=1; QIDIANID=Q4l+8p+7M86kIsIyKi6QuMMxpv2kxXbyv7+NiKkBnBxeMNoejOSOJh0JOzT5VdC8; SMYUV=1567084685160551; UM_distinctid=16cdd86a76bb81-06651bda9ec90a-4d045769-1fa400-16cdd86a76c963; GOTO=Af99046; ad=Hjx0Nlllll2NtCV$lllllVCT74llllllNYkMnZllll9lllllj0ULn5@@@@@@@@@@; wuid=AAFeRCfSKQAAAAqHEEfWUAAAkwA=; FREQUENCY=1568125047430_1; front_screen_resolution=1920*1080; usid=kY0HNhvMU5jsLQMx; IPLOC=CN5101; ld=MZllllllll2NJXlklllllVL4ltYlllllHIjeFlllll9lllllVklll5@@@@@@@@@@; LSTMV=146%2C33; LCLKINT=3197; SNUID=533ADB0C6164F215F13E1515562E4F098; ABTEST=8|1569627405|v1",
"Host": "weixin.sogou.com",
"Referer": "https://weixin.sogou.com/",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
"X-Requested-With": "XMLHttpRequest"
}
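# Note: the Cookie (in particular SNUID) above is tied to a real browsing
# session and expires; without a fresh value Sogou may answer with a CAPTCHA
# page instead of the news list.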
class FileEventHandler(FileSystemEventHandler):
    def on_any_event(self, event):
        pass

    def rmFile(self, path):
        # Remove the trigger file once its categories have been queued.
        try:
            os.remove(path)
        except Exception:
            pass

    def checkPathStatus(self, path):
        # Skip editors' temporary files (e.g. JetBrains "jb_tmp"/"jb_old") and
        # paths that no longer exist by the time the event is handled.
        if "jb_tmp" in path or "jb_old" in path or not os.path.exists(path):
            return False
        return True

    def requestsUrl(self, path):
        if self.checkPathStatus(path):
            categoryQueue = getCategoryQueue(path)
            self.rmFile(path)
            if categoryQueue is None:
                return
            for i in range(threadNum):
                w = wxSouSpider(categoryQueue)
                w.start()

    def on_moved(self, event):
        if not event.is_directory:
            self.requestsUrl(event.dest_path)

    def on_created(self, event):
        if not event.is_directory:
            self.requestsUrl(event.src_path)

    def on_modified(self, event):
        if not event.is_directory:
            self.requestsUrl(event.src_path)
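# The watcher is driven by plain text trigger files dropped into monitorPath,
# one category id per line; getCategoryQueue below turns such a file into a
# queue. An example trigger file (ids assumed to map to Sogou's pc_<id> feeds):
#
#   0
#   1
#   2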
class wxSouSpider(threading.Thread):
    def __init__(self, categoryQueue, *args, **kwargs):
        super(wxSouSpider, self).__init__(*args, **kwargs)
        self.categoryQueue = categoryQueue

    def getHTML(self, url, retries=3):
        # Fetch a page; retry a few times with a short pause instead of the
        # original infinite loop, and return None if all attempts fail.
        for _ in range(retries):
            try:
                resp = requests.get(url, headers=headers, timeout=10)
                return resp.content.decode("utf-8")
            except Exception:
                time.sleep(1)
        return None

    def getCategoryUrl(self, category):
        return "https://weixin.sogou.com/pcindex/pc/pc_" + str(category) + "/pc_" + str(category) + ".html"
    def md(self, s):
        # md5 hex digest, used as a content hash for deduplication.
        return hashlib.md5(str(s).encode("utf-8")).hexdigest()
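    # Example: md("abc") returns "900150983cd24fb0d6963f7d28e17f72"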
    def getDate(self, ts):
        # Convert a unix timestamp into a relative age string such as "5分钟前".
        try:
            nowTs = int(time.time())
            diffTime = nowTs - int(ts)
            if diffTime < 60:
                num = diffTime
                unit = "秒"
            elif diffTime < 3600:
                num = diffTime // 60
                unit = "分钟"
            elif diffTime < 86400:
                num = diffTime // 3600
                unit = "小时"
            else:
                num = diffTime // 86400
                unit = "天"
            return str(num) + unit + "前"
        except Exception:
            return None
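    # Example: a timestamp 90 seconds in the past yields "1分钟前",
    # one from two days ago yields "2天前".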
    def getNewsList(self, url, category):
        html = self.getHTML(url)
        if not html:
            return None
        soup = BeautifulSoup(html, "html.parser")
        newsList = []
        try:
            lis = soup.find("ul", attrs={"class": "news-list"}).find_all("li")
            for li in lis:
                metaNews = {}
                metaNews['category'] = category
                # banner: thumbnail URL. The img-box selector is an assumption
                # about the list markup and may need adjusting; the original
                # hashed the banner before ever setting it, so hs was always "".
                metaNews['banner'] = ""
                try:
                    metaNews['banner'] = li.find("div", attrs={"class": "img-box"}).find("img")['src']
                except Exception:
                    pass
                metaNews['hs'] = self.md(metaNews['banner'])
                try:
                    txtBox = li.find("div", attrs={"class": "txt-box"})
                    a = txtBox.find("h3").find("a")
                except Exception:
                    continue
                metaNews['title'] = ""
                try:
                    metaNews['title'] = a.text
                except Exception:
                    pass
                metaNews['url'] = ""
                try:
                    # hrefs come back HTML-escaped; restore the raw query separator.
                    metaNews['url'] = str(a['href']).replace("&amp;timestamp", "&timestamp")
                except Exception:
                    pass
                metaNews['description'] = ""
                try:
                    metaNews['description'] = txtBox.find("p", "txt-info").text
                except Exception:
                    pass
                metaNews['source'] = ""
                try:
                    metaNews['source'] = txtBox.find("div", "s-p").find("a").text
                except Exception:
                    pass
                metaNews['date'] = ""
                try:
                    metaNews['date'] = self.getDate(txtBox.find("div", "s-p")['t'])
                except Exception:
                    pass
                newsList.append(metaNews)
            return newsList
        except Exception:
            return None
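    # Each item returned by getNewsList looks roughly like (values illustrative):
    #   {'category': '0', 'banner': 'https://img01.sogoucdn.com/...',
    #    'hs': '<md5 of banner>', 'title': '...', 'url': 'https://mp.weixin.qq.com/s?...',
    #    'description': '...', 'source': '...', 'date': '5分钟前'}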
    def pipLine(self, news):
        # Persist one news item. A parameterized query avoids the quoting bugs
        # and SQL injection that the original string interpolation was prone to.
        try:
            conn = MySQLdb.connect(user=mysql_user, host="127.0.0.1", password=mysql_password, database=mysql_database, charset='utf8')
            cursor = conn.cursor()
            cursor.execute(
                "insert into " + mysql_table + "(category, banner, title, url, description, source, `date`, hs) "
                "values(%s, %s, %s, %s, %s, %s, %s, %s)",
                (news['category'], news['banner'], news['title'], news['url'], news['description'], news['source'], news['date'], news['hs'])
            )
            conn.commit()
            conn.close()
            return True
        except Exception:
            return False
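    # Design note: opening a fresh connection per row keeps pipLine simple but
    # is slow; reusing one connection per thread (or a pool) is the usual fix.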
    def run(self):
        # Worker: drain the category queue, crawl each feed, store the items.
        while True:
            if self.categoryQueue.empty():
                break
            category = self.categoryQueue.get()
            url = self.getCategoryUrl(category)
            newsList = self.getNewsList(url, category)
            if newsList:
                for news in newsList:
                    status = self.pipLine(news)
                    if not status:
                        break
def getCategoryQueue(path):
    # Read category ids (one per line) from a trigger file into a queue.
    try:
        categoryQueue = Queue(0)
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                category = "".join(line.split())  # drop all whitespace
                if category:
                    categoryQueue.put(category)
        return categoryQueue
    except Exception:
        return None
if __name__ == '__main__':
    observer = Observer()
    event_handler = FileEventHandler()
    # Watch monitorPath recursively for new/changed trigger files.
    observer.schedule(event_handler, monitorPath, True)
    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
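# To trigger a crawl while the script runs, drop a category file into
# monitorPath, e.g. from a shell (path abbreviated for illustration):
#   echo 0 > .../crawler/wxSou/category/cat.txt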