Crawling WeChat Articles via Sogou Search

The publish time of a WeChat article cannot be taken directly with XPath, because it is rendered into the page by JavaScript. A regular expression is used to pull it out of the inline script instead.
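
As a quick illustration, here is a minimal sketch of that regex against a hypothetical fragment of the article page's inline script (the `var publish_time` variable is what the crawler below relies on; the sample string itself is made up):

import re

# hypothetical inline-script fragment; real article pages embed publish_time similarly
sample = 'var publish_time = "2018-08-20" || "";'
match = re.search(r'var publish_time = "(.*?)"', sample)
print(match.group(1) if match else '')  # -> 2018-08-20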

from urllib.parse import urlencode
import requests, re
from requests.exceptions import ConnectionError, ReadTimeout
from lxml import etree
import pymongo
from config import *

client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]

baseurl = 'http://weixin.sogou.com/weixin?'

# Fill in the Cookie from a logged-in session; update it when it expires
headers = {
    'Cookie': '',
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}

proxy = None

def get_proxy():
    try:
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('proxy pool is empty')
        return None

def get_html(url, count=1):
    print('crawling', url)
    print('trying count', count)
    global proxy
    if count >= MAX_COUNT:
        print("tried too many times")
        return None
    try:
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies, timeout=10)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # Sogou's anti-crawler answers with a redirect; switch to a fresh proxy and retry
            print('302 received')
            proxy = get_proxy()
            if proxy:
                print('using proxy', proxy)
                return get_html(url, count + 1)
            else:
                print('get proxy failed')
                return None
    except (ConnectionError, ReadTimeout):
        proxy = get_proxy()
        count += 1
        return get_html(url, count)


def get_index(keyword, pagenumber):
    data = {
        'query': keyword,
        'type': 2,  # type=2 searches articles; type=1 searches official accounts
        'page': pagenumber
    }

    url = baseurl + urlencode(data)
    html = get_html(url)
    return html

def parse_index(html):
    html = etree.HTML(html)
    urls = html.xpath("//div[@class='news-box']/ul/li/div/h3/a/@href")
    for url in urls:
        yield url

def get_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None

def parse_detail(html):
    doc = etree.HTML(html)
    title = doc.xpath("//h2[@class='rich_media_title']/text()")
    if title:
        title = title[0].strip()
    else:
        # video articles put the title in a different element
        title = ''.join(doc.xpath("//span[@id='video_title']/text()"))
    content = doc.xpath("//div[@id='js_content']")[0].xpath('string(.)').strip()
    # publish_time is injected by an inline script, so extract it with a regex
    match = re.search(r'var publish_time = "(.*?)"', html)
    date = match.group(1) if match else ''
    nickname = doc.xpath("//span[@class='rich_media_meta rich_media_meta_nickname']/a/text()")
    if nickname:
        nickname = nickname[0].strip()
    else:
        nickname = doc.xpath("//strong[@class='account_nickname_inner']/text()")[0].strip()
    wechat = ''.join(doc.xpath("//div[@id='js_profile_qrcode']/div/p[1]/span/text()"))
    return {
        'title' : title,
        'content' : content,
        'date' : date,
        'nickname' : nickname,
        'wechat' : wechat
    }

def save_to_mongo(item):
    # upsert keyed on title, so re-crawled articles update in place instead of duplicating
    result = db['articles'].update_one({'title': item['title']}, {'$set': item}, upsert=True)
    if result.acknowledged:
        print('saved to mongo')
    else:
        print('save failed')

def main():
    for page in range(1, 101):
        html = get_index(KEYWORD, page)
        if html:
            for url in parse_index(html):
                article_html = get_detail(url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    save_to_mongo(article_data)


if __name__ == '__main__':
    main()

config.py

KEYWORD = '风景'

MONGO_URI = 'localhost'
MONGO_DB = 'weixin'

PROXY_POOL_URL = ''  # point this at your proxy pool's API endpoint

MAX_COUNT = 5
