Web Crawler --- WeChat Crawler

# WeChat crawler: fetches information on WeChat articles automatically.
# Strategy: spoof a browser User-Agent and route requests through a proxy IP.
import urllib.request
import urllib.error
import time
import re

def use_proxy(url, proxy_addr):
    """Fetch url through the given proxy, posing as a desktop browser.

    Returns the decoded page text, or None if the request fails.
    """
    try:
        req = urllib.request.Request(url)
        # Spoof a desktop Chrome User-Agent so Sogou serves a normal page.
        req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36')
        # Route both http and https traffic through the proxy; the original
        # mapped only 'http', so https URLs silently bypassed the proxy.
        proxy = urllib.request.ProxyHandler({'http': proxy_addr, 'https': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        time.sleep(5)  # back off before the caller moves on
    except Exception as e:
        print('exception: ' + str(e))
        time.sleep(1)
    return None  # explicit: any failure yields None

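# Note: install_opener() installs the proxy globally, so every subsequent
# urlopen() call in this script is routed through it; for isolated requests,
# opener.open(req) avoids touching global state. Also, build_opener() adds
# HTTPHandler by default, so passing it explicitly above is redundant but
# harmless.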
key = "python"
quoted_key = urllib.request.quote(key)  # URL-encode the keyword once, up front
proxy = "58.244.59.185:8080"  # proxy server address; free proxies go stale fast, so substitute a live one (see the check_proxy sketch after the script)

for i in range(1, 11):  # Sogou result pages are 1-indexed
    # Quoting was moved out of the loop: re-quoting each pass double-encodes
    # the keyword. The sut/lkt/sst0 parameters are session tokens captured
    # from one manual search in a browser.
    thispageurl = "https://weixin.sogou.com/weixin?query="+quoted_key+"&_sug_type_=&sut=1044&lkt=7%2C1567498024564%2C1567498025603&s_from=input&_sug_=y&type=2&sst0=1567498025704&page="+str(i)+"&ie=utf8&w=01019900&dr=1"
    print(thispageurl)
    thispagedata = use_proxy(thispageurl, proxy)
    if thispagedata is None:
        print("page " + str(i) + ": request failed, skipping")
        continue
    print(len(thispagedata))

    # Each search result carries the article URL in a data-share attribute
    # (the pattern is demonstrated in isolation after the script).
    pat1 = 'data-share="(.*?)"'
    rs1 = re.compile(pat1, re.S).findall(thispagedata)
    if len(rs1) == 0:
        print("page " + str(i) + ": no articles matched, skipping")
        continue
    for j in range(0, len(rs1)):
        thisurl = rs1[j]
        # Undo HTML entity-escaping: '&amp;' in the attribute is a literal '&'.
        thisurl = thisurl.replace('&amp;', '&')
        file = "d:/25/page"+str(i)+"_article"+str(j)+".html"  # the directory d:/25/ must already exist
        thisdata = use_proxy(thisurl, proxy)
        thisdata = use_proxy(thisurl,proxy)
        # The save must happen inside the inner loop; in the original the
        # try block sat outside it, so only the last article was written.
        if thisdata is None:
            print("page " + str(i) + " article " + str(j) + ": download failed")
            continue
        try:
            fh = open(file, 'wb')
            fh.write(thisdata.encode('utf-8'))  # use_proxy returns str, so re-encode for the binary write
            fh.close()
            print("page " + str(i) + " article " + str(j) + ": saved")
        except Exception as e:
            print(e)
            print("page " + str(i) + " article " + str(j) + ": save failed")
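Free proxies like the hard-coded address above die quickly, so it helps to probe the proxy once before starting the crawl. A minimal sketch, assuming the Sogou homepage as the test target and a 10-second timeout (both choices are illustrative, not part of the original script):

def check_proxy(proxy_addr, test_url="https://weixin.sogou.com/", timeout=10):
    # Return True if the proxy can fetch test_url within the timeout.
    try:
        handler = urllib.request.ProxyHandler({'http': proxy_addr, 'https': proxy_addr})
        opener = urllib.request.build_opener(handler)
        opener.open(test_url, timeout=timeout)
        return True
    except Exception:
        return False

if not check_proxy(proxy):
    print("proxy " + proxy + " looks unreachable; replace it before crawling")

Using opener.open() here rather than install_opener() keeps the probe from disturbing the opener the crawler installs globally.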

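The data-share extraction can also be checked in isolation. A self-contained sketch; the HTML line is invented for illustration, and real Sogou markup carries more attributes:

import re

html = '<a data-share="https://mp.weixin.qq.com/s?src=1&amp;id=abc">title</a>'
urls = re.compile('data-share="(.*?)"', re.S).findall(html)
print([u.replace('&amp;', '&') for u in urls])
# prints: ['https://mp.weixin.qq.com/s?src=1&id=abc']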