Python应对反爬虫——随机切换浏览器请求头(User-Agent)

1.导入re模块:正则

2.导入urllib.request模块:爬虫

3.导入random模块:随机数

4.extend():列表末尾一次性追加另一个序列中的多个值

5.findall():返回字符串中与正则表达式相匹配的全部子串,返回形式为列表

6.compile():将正则表达式字符串编译为正则表达式(Pattern)对象,便于重复使用

import urllib
import urllib.request
import random
import re



# Pool of browser User-Agent header values; one is picked at random for each
# outgoing request. Entries are kept verbatim (including trailing spaces).
user_agent = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64)",
    "Mozilla/5.0 (Windows NT 6.3; WOW64)",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12",
    "Opera/9.27 (Windows NT 5.2; U; zh-cn)",
    "Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0",
    "Opera/8.0 (Macintosh; PPC Mac OS X; U; en)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11",
]


def getpage(path, encoding="gbk"):
    """Download *path* and return the response body as decoded text.

    The request carries a ``User-Agent`` header chosen at random from the
    module-level ``user_agent`` list, so successive requests present
    different browser identities.

    Args:
        path: URL to fetch.
        encoding: Codec used to decode the response bytes. Defaults to
            ``"gbk"``, the value that was previously hard-coded.

    Returns:
        The decoded page source as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    request = urllib.request.Request(
        url=path,
        headers={"User-Agent": random.choice(user_agent)},
    )
    # The context manager closes the response (and its socket) even when
    # read() or decode() raises; previously the response was never closed.
    with urllib.request.urlopen(request) as response:
        return response.read().decode(encoding)


def runpage(min, max):
    """Fetch and print every listing page numbered from ``min`` to ``max - 1``.

    For each page number the URL is built and printed, the page is downloaded
    with ``getpage``, reduced to text fragments with ``gettbody`` and printed
    with ``showdata``.

    Args:
        min: First page number (inclusive).
        max: Page number at which to stop (exclusive).
    """
    for number in range(min, max):
        url = "".join(
            ("http://quote.stockstar.com/fund/stock_3_1_", str(number), ".html")
        )
        print(url)

        # Download, extract, then display one page at a time.
        source = getpage(url)
        fragments = gettbody(source)
        showdata(fragments)



def gettbody(data):
    """Return every text fragment found between a ``>`` and the next ``<``.

    Fragments are matched non-greedily and never span a newline (``.`` does
    not match ``\\n``). Empty fragments, e.g. between two adjacent tags, are
    included as ``""``.

    NOTE(review): despite the name, no ``<tbody>`` isolation is performed.
    The previous implementation first applied a pattern that matches the
    entire input, so it selected the whole document; that pass is dropped
    here because it did not change the result. The pattern was also written
    as a non-raw string containing ``\\s``, which is an invalid escape
    sequence and triggers a warning on current Python versions.

    Args:
        data: Page source as a string.

    Returns:
        A list of strings, one per ``>...<`` match, in document order.
    """
    # Raw string keeps the regex free of Python-level escape processing.
    fragment_pattern = re.compile(r">(.*?)<")
    return fragment_pattern.findall(data)


def showdata(datalist):
    """Print the cells of *datalist* as a left-aligned table, ten per row.

    Each cell is stripped of surrounding whitespace and empty cells are
    dropped. If a cell equal to ``"市盈率"`` is present, printing starts right
    after its last occurrence; otherwise printing starts at the first cell.
    Each cell is padded to 15 characters and followed by two spaces.

    Behaviour fixed relative to the earlier version:
      * the fallback start index was never assigned (``==`` instead of
        ``=``), so output began with the last cell and an empty list raised
        ``IndexError``;
      * the result of ``strip()`` was discarded, so cells were compared and
        printed unstripped.

    The input list is not modified.

    Args:
        datalist: List of strings to display.
    """
    columns = 10

    # Strip before filtering so whitespace-only cells are dropped together
    # with truly empty ones and do not shift the column layout.
    stripped = [cell.strip() for cell in datalist]
    cells = [cell for cell in stripped if cell]

    # Start after the last header marker when there is one, else at the top.
    start = 0
    for index, cell in enumerate(cells):
        if cell == "市盈率":
            start = index + 1

    for row_start in range(start, len(cells), columns):
        for cell in cells[row_start:row_start + columns]:
            print(format(cell, "<15s"), end="  ")
        print("")



# Earlier experiments, kept for reference:
#   runpage(1, 45)
#   print(getpage("http://quote.stockstar.com/stock/sha_3_1_1.html"))
#   data = getpage("http://quote.stockstar.com/stock/sha_3_1_1.html")

# Crawl and print listing pages 1 through 26 (the upper bound is exclusive).
runpage(min=1, max=27)

你可能感兴趣的:(Python)