__author__ = 'fen'
# coding=utf8
import urllib
import urllib2
from StringIO import StringIO
import gzip

import bs4
import requests


def base1(url):
    # Simplest fetch: read the response body directly.
    content = urllib2.urlopen(url).read()
    return content


def agent(url, proxy=None):
    if proxy:
        # Redirect the request through an HTTP proxy (the original passed the
        # page URL itself as the proxy address, which cannot work).
        proxy_support = urllib2.ProxyHandler({'http': proxy})
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
    content = urllib2.urlopen(url).read()
    # Add a browser-like User-Agent header to deal with sites that answer
    # 403 Forbidden to the default urllib2 agent.
    i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}
    req = urllib2.Request(url, headers=i_headers)
    html = urllib2.urlopen(req)
    if url == html.geturl():  # no redirect happened
        return html.read()
    return content


def para1(url, page):
    # GET request with parameters appended after '?'
    header_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:37.0) Gecko/20100101 Firefox/37.0'
    # Some sites refuse anything that looks like a crawler, so masquerade as a
    # browser by setting the User-Agent header; it can be varied freely.
    headers = {'User-Agent': header_agent}
    payload = {'curr_Page': page}  # parameters joined after '?', e.g. curr_Page, condition.pageNo
    html = requests.get(url, headers=headers, params=payload).text
    return html


def para2(url, page, rn1, rn2):
    # url = 'http://gsxt.hnaic.gov.cn/notice/search/ent_except_list'
    # Randomize the User-Agent (rn1/rn2 vary the version numbers) so repeated
    # visits are less likely to be blocked.
    header_agent = 'Mozilla/5.' + str(rn1) + ' (X11; Ubuntu; Linux x86_32; rv:37.0) Gecko/20100101 Firefox/36.' + str(rn2)
    headers = {'User-Agent': header_agent}
    values = {
        'random': '1440940998226',
        'cxyzm': 'no',
        'page.currentPageNo': str(page),
    }  # POST parameters, the '&'-joined case
    data = urllib.urlencode(values)  # encode the form parameters
    req = urllib2.Request(url, data, headers=headers)
    req.add_header('Accept-encoding', 'gzip')
    response = urllib2.urlopen(req)
    body = response.read()
    # The source may be gzip-compressed; decompressing it shows the real page.
    if response.info().get('Content-Encoding') == 'gzip':
        body = gzip.GzipFile(fileobj=StringIO(body)).read()
    html = bs4.BeautifulSoup(body)  # bs4 detects the encoding automatically
    # The encoding could also be sniffed with chardet (slower):
    # charset = chardet.detect(body)
    # code = charset['encoding']  # encoding of the page source
    # text = str(html).decode(code, 'ignore').encode('utf-8')
    html = str(html)  # force the BeautifulSoup object back to str
    return html

# print agent('http://gsxt.ngsh.gov.cn/ECPS/enterpriseAbnAction_enterpriseList.action?curr_Page=2')
# print para1('http://gsxt.ngsh.gov.cn/ECPS/enterpriseAbnAction_enterpriseList.action', 2)
# print para2(url='http://gsxt.hnaic.gov.cn/notice/search/ent_except_list', page=2, rn1=3, rn2=2)
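
# A minimal usage sketch, assuming the government-registry URL from the
# commented examples above is still reachable (it may have changed or gone
# offline since this was written):
if __name__ == '__main__':
    page_html = para1('http://gsxt.ngsh.gov.cn/ECPS/enterpriseAbnAction_enterpriseList.action', 2)
    print page_html[:200]  # show the first 200 characters of the fetched page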