# 微信爬虫,爬取网页信息(使用代理和模拟浏览器)

#http://weixin.sogou.com/
import re
import urllib.request
import time
import urllib.error
import urllib.request

import scipy
#自定义函数,功能为使用代理服务器爬一个网址
def use_proxy(proxy_addr,url):
    """Fetch *url* through the HTTP proxy *proxy_addr*.

    Args:
        proxy_addr: proxy in "host:port" form, used for http:// requests.
        url: the address to fetch.

    Returns:
        The raw response body as ``bytes``, or ``None`` when the request
        failed (after printing the error and backing off).
    """
    try:
        req=urllib.request.Request(url)
        # Spoof a desktop-browser User-Agent so the site serves normal pages.
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')
        proxy= urllib.request.ProxyHandler({'http':proxy_addr})
        # build_opener already includes the default handlers, so the extra
        # HTTPHandler argument of the original was redundant.
        opener = urllib.request.build_opener(proxy)
        # Keep the original module-level side effect: later bare urlopen()
        # calls also go through this proxy.
        urllib.request.install_opener(opener)
        return urllib.request.urlopen(req).read()
    except urllib.error.URLError as e:
        if hasattr(e,"code"):
            print(e.code)
        if hasattr(e,"reason"):
            print(e.reason)
        # Network-level failure: back off 10 seconds before the caller
        # moves on (the original did not retry, and neither do we).
        time.sleep(10)
        return None
    except Exception as e:
        print("exception:"+str(e))
        # Non-URL error (e.g. bad URL scheme): short 1-second back-off.
        time.sleep(1)
        return None

# Search keyword.
key="Python"
# Proxy server; likely stale — replace with a working one before running.
proxy="120.76.231.27:3128"
# URL-encode the keyword ONCE, outside the loop: the original re-quoted the
# already-quoted value every iteration, which double-encodes any non-ASCII
# keyword.
quoted_key=urllib.request.quote(key)
# Pattern extracting article links from a Sogou result page.
# NOTE(review): the pattern literal was corrupted in the source; this is a
# reconstruction matching Sogou/WeChat result markup — confirm against a
# live result page before relying on it.
pat1=r'<a href="(http://mp.weixin.qq.com/s\?[^"]*)"'
link_re=re.compile(pat1,re.S)
# Crawl result pages 1-9 (range(1,10) excludes 10, matching the original).
for i in range(1,10):
    thispageurl= 'http://weixin.sogou.com/weixin?query='+quoted_key+'&_sug_type_=&sut=4983&lkt=10%2C1527762297236%2C1527762302210&s_from=input&_sug_=y&type=2&sst0=1527762302313&page='+str(i)+'&ie=utf8&w=01019900&dr=1'
    print(thispageurl)
    thispagedata=use_proxy(proxy,thispageurl)
    # use_proxy returns None on failure; treat that like an empty page
    # instead of crashing on len()/findall below.
    if thispagedata is None:
        print("此次("+str(i)+"页)没成功")
        continue
    print(len(str(thispagedata)))

    rs1=link_re.findall(str(thispagedata))
    if(len(rs1)==0):
        print("此次("+str(i)+"页)没成功")
        continue
    for j in range(0,len(rs1)):
        thisurl=rs1[j]
        # Sogou HTML-escapes '&' as '&amp;' inside href attributes.
        thisurl=thisurl.replace("amp;","")
        print(thisurl)
        file="F:/news/第"+str(i)+"页第"+str(j)+"篇文章.html"
        thisdata=use_proxy(proxy,thisurl)
        # Skip the article when the fetch failed: the original passed None
        # to fh.write(), which always raised and reported a bogus failure.
        if thisdata is None:
            print("第"+str(i)+"页第"+str(j)+"篇文章失败")
            continue
        try:
            # with-statement guarantees the handle is closed even on error.
            with open(file,"wb") as fh:
                fh.write(thisdata)
            print("第"+str(i)+"页第"+str(j)+"篇文章成功")
        except Exception as e:
            print(e)
            print("第"+str(i)+"页第"+str(j)+"篇文章失败")

# 你可能感兴趣的:(Python,python网络编程)