Python Crawler Series: Scraping WeChat Official Account News Data


This code is for learning and exchange only; do not use it for illegal purposes.

  • Watch a configured directory for new files and crawl the categories each file lists (a trigger sketch follows below)
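
The watcher consumes plain-text files dropped into monitorPath, one Sogou category identifier per line. Here is a minimal trigger sketch; the file name categories.txt and the identifiers 0 and 1 are illustrative assumptions, not values from the original post:

# -*- coding:utf-8 -*-
# Drop a category file into the watched directory to trigger a crawl.
# The file name and the category identifiers are illustrative assumptions.
import os

monitorPath = r"c:/users/asus/PycharmProjects/it002/crawler/wxSou/category/"

with open(os.path.join(monitorPath, "categories.txt"), "w", encoding="utf-8") as f:
    f.write("0\n1\n")  # one category id per line; each maps to a pc_<id> listing page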

1. Code Implementation

# -*- coding:utf-8 -*-
import os
import time
import hashlib
import threading
from queue import Queue

import requests
import MySQLdb
from bs4 import BeautifulSoup
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

'''
    @Author  :wanglei
    @Date    :2019/10/10
    @Desc    :Crawl WeChat news listings from Sogou (weixin.sogou.com)
'''
#---------------------------------------------------------------------------
threadNum = 1
mysql_user = "root"
mysql_password = "root"
mysql_database = "news"
mysql_table = "wx_sou_news"
monitorPath = r"c:/users/asus/PycharmProjects/it002/crawler/wxSou/category/"
#---------------------------------------------------------------------------

headers = {
    "Cookie": "SUV=005A28DAABD72B4D5D1D3A358EDDF616; CXID=2B6C8A624E9C1A5A5FA5AAEA5CE40242; SUID=4D2BD7AB3220910A000000005D1CCB39; YYID=18AFB43C3A5F3B3AA6478C7D6E3167A1; pgv_pvi=6812688384; weixinIndexVisited=1; sct=1; QIDIANID=Q4l+8p+7M86kIsIyKi6QuMMxpv2kxXbyv7+NiKkBnBxeMNoejOSOJh0JOzT5VdC8; SMYUV=1567084685160551; UM_distinctid=16cdd86a76bb81-06651bda9ec90a-4d045769-1fa400-16cdd86a76c963; GOTO=Af99046; ad=Hjx0Nlllll2NtCV$lllllVCT74llllllNYkMnZllll9lllllj0ULn5@@@@@@@@@@; wuid=AAFeRCfSKQAAAAqHEEfWUAAAkwA=; FREQUENCY=1568125047430_1; front_screen_resolution=1920*1080; usid=kY0HNhvMU5jsLQMx; IPLOC=CN5101; ld=MZllllllll2NJXlklllllVL4ltYlllllHIjeFlllll9lllllVklll5@@@@@@@@@@; LSTMV=146%2C33; LCLKINT=3197; SNUID=533ADB0C6164F215F13E1515562E4F098; ABTEST=8|1569627405|v1",
    "Host": "weixin.sogou.com",
    "Referer": "https://weixin.sogou.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
    "X-Requested-With": "XMLHttpRequest"
}


class FileEventHandler(FileSystemEventHandler):
    def on_any_event(self, event):
        pass

    def rmFile(self, path):
        # best-effort delete; the file may already be gone
        try:
            os.remove(path)
        except Exception as e:
            pass

    def checkPathStatus(self, path):
        # skip JetBrains safe-write temp files (jb_tmp / jb_old) and vanished paths
        if "jb_tmp" in path or "jb_old" in path or not os.path.exists(path):
            return False
        return True

    def requestsUrl(self, path):
        # read the dropped category file into a queue, remove it, then start workers
        if self.checkPathStatus(path):
            categoryQueue = getCategoryQueue(path)
            self.rmFile(path)
            for i in range(threadNum):
                w = wxSouSpider(categoryQueue)
                w.start()

    def on_moved(self, event):
        if not event.is_directory:
            path = event.dest_path
            self.requestsUrl(path)

    def on_created(self, event):
        if not event.is_directory:
            path = event.src_path
            self.requestsUrl(path)

    def on_modified(self, event):
        if not event.is_directory:
            path = event.src_path
            self.requestsUrl(path)


class wxSouSpider(threading.Thread):
    def __init__(self, categoryQueue, *args, **kwargs):
        super(wxSouSpider, self).__init__(*args, **kwargs)
        self.categoryQueue = categoryQueue

    def getHTML(self, url):
        # retry until the fetch succeeds; sleep briefly so failures don't busy-loop
        while True:
            try:
                resp = requests.get(url, headers=headers, timeout=10)
                return resp.content.decode("utf-8")
            except Exception as e:
                time.sleep(1)

    def getCategoryUrl(self, category):
        # Sogou serves each category's listing fragment at pcindex/pc/pc_<id>/pc_<id>.html
        return "https://weixin.sogou.com/pcindex/pc/pc_" + str(category) + "/pc_" + str(category) + ".html"

    def md(self, s):
        # md5 hex digest, used as a content hash for de-duplication
        return hashlib.md5(str(s).encode("utf-8")).hexdigest()

    def getDate(self, ts):
        # turn the listing's unix timestamp into a relative-time string,
        # e.g. "5分钟前" = "5 minutes ago"
        try:
            nowTs = int(time.time())
            diffTime = nowTs - int(ts)
            if diffTime < 60:
                num, unit = diffTime, "秒"          # seconds
            elif diffTime < 3600:
                num, unit = diffTime // 60, "分钟"   # minutes
            elif diffTime < 86400:
                num, unit = diffTime // 3600, "小时"  # hours
            else:
                num, unit = diffTime // 86400, "天"   # days
            return str(num) + unit + "前"  # "... ago"
        except Exception as e:
            return None

    def getNewsList(self, url, category):
        html = self.getHTML(url)
        soup = BeautifulSoup(html, "html.parser")
        newsList = []
        try:
            lis = soup.find("ul", attrs={"class": "news-list"}).find_all("li")
            for li in lis:
                metaNews = {}
                metaNews['category'] = category
                metaNews['banner'] = ""
                metaNews['hs'] = ""
                try:
                    # the original never populated 'banner'; the list item's <img> src
                    # is the likely source, and its md5 serves as the dedup hash
                    metaNews['banner'] = li.find("img")['src']
                    metaNews['hs'] = self.md(metaNews['banner'])
                except Exception as e:
                    pass
                txtBox = ""
                try:
                    txtBox = li.find("div", attrs={"class": "txt-box"})
                except Exception as e:
                    continue
                a = ""
                try:
                    a = txtBox.find("h3").find("a")
                except Exception as e:
                    continue
                metaNews['title'] = ""
                try:
                    metaNews['title'] = a.text
                except Exception as e:
                    pass
                metaNews['url'] = ""
                try:
                    # the published code garbled this call via HTML entities ("&times;...");
                    # un-escaping "&amp;" in the href is the likely original intent
                    metaNews['url'] = str(a['href']).replace("&amp;", "&")
                except Exception as e:
                    pass
                metaNews['description'] = ""
                try:
                    metaNews['description'] = txtBox.find("p", "txt-info").text
                except Exception as e:
                    pass
                metaNews['source'] = ""
                try:
                    metaNews['source'] = txtBox.find("div", "s-p").find("a").text
                except Exception as e:
                    pass
                metaNews['date'] = ""
                try:
                    metaNews['date'] = self.getDate(txtBox.find("div", "s-p")['t'])
                except Exception as e:
                    pass
                newsList.append(metaNews)
            return newsList
        except Exception as e:
            return None

    def pipLine(self, news):
        # insert one news record; a parameterized query avoids quoting and injection problems
        try:
            conn = MySQLdb.connect(user=mysql_user, host="127.0.0.1", password=mysql_password,
                                   database=mysql_database, charset='utf8')
            cursor = conn.cursor()
            cursor.execute("insert into " + mysql_table +
                           "(category, banner, title, url, description, source, `date`, hs) "
                           "values(%s, %s, %s, %s, %s, %s, %s, %s)",
                           (news['category'], news['banner'], news['title'], news['url'],
                            news['description'], news['source'], news['date'], news['hs']))
            conn.commit()
            conn.close()
            return True
        except Exception as e:
            return False

    def run(self):
        # drain the shared category queue; a failed insert aborts the rest of that page
        while True:
            if self.categoryQueue.empty():
                break
            category = self.categoryQueue.get()
            url = self.getCategoryUrl(category)
            newsList = self.getNewsList(url, category)
            if newsList:
                for news in newsList:
                    status = self.pipLine(news)
                    if not status:
                        break


def getCategoryQueue(path):
    # read one category identifier per line, dropping all whitespace and empty lines
    try:
        categoryQueue = Queue(0)
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                category = "".join(line.split())
                if category:
                    categoryQueue.put(category)
        return categoryQueue
    except Exception as e:
        pass


if __name__ == '__main__':
    observer = Observer()
    event_handler = FileEventHandler()
    observer.schedule(event_handler, monitorPath, True)
    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
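
For reference, pipLine assumes the target table already exists. The setup sketch below is reconstructed from the insert statement, not taken from the original post; the column names match, but the types and the id column are assumptions:

# -*- coding:utf-8 -*-
# One-off setup sketch: creates the table pipLine writes to.
# Reconstructed from the insert statement above; column types are assumptions.
import MySQLdb

conn = MySQLdb.connect(user="root", host="127.0.0.1", password="root",
                       database="news", charset="utf8")
cursor = conn.cursor()
cursor.execute("""
    create table if not exists wx_sou_news (
        id int auto_increment primary key,
        category varchar(32),
        banner varchar(512),
        title varchar(512),
        url varchar(1024),
        description text,
        source varchar(128),
        `date` varchar(32),
        hs char(32)
    ) default charset=utf8
""")
conn.commit()
conn.close()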
