Python requests crawler framework

A basic crawler framework built on Python requests

The Website is api…

Crawler basics

  1. requests: send HTTP requests and fetch HTML pages;
  2. robots: the Robots Exclusion Standard that crawlers should respect (see the robots.txt sketch after this list);
  3. Beautiful Soup: parse HTML pages;
  4. re: extract fields with regular expressions;
  5. Scrapy[^1]: a full-scale crawling framework;
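
Item 2 above only names the standard. As a minimal sketch of checking it before crawling (baidu.com is used purely as an illustrative target), the standard-library urllib.robotparser can tell you whether a URL may be fetched:

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('http://www.baidu.com/robots.txt')  # robots.txt lives at the site root
rp.read()                                      # fetch and parse the rules

# can_fetch(user_agent, url): True if the rules allow this agent to request the URL
print(rp.can_fetch('*', 'http://www.baidu.com/s?wd=python'))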

### Basic requests functions

import requests

url = 'http://www.baidu.com'
response = requests.get(url)
print(response.status_code)        # 200
print(type(response))              # <class 'requests.models.Response'>
print(response.headers)            # {'Date': 'Mon, 21 Jan 2019 12:29:05 GMT', 'Connection': 'Keep-Alive', ...}
print(response.encoding)           # ISO-8859-1: encoding guessed from the HTTP headers; this is the default when no charset is present
print(response.apparent_encoding)  # utf-8: fallback encoding guessed from the response content
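
Besides inspecting the attributes above, requests.get also takes keyword arguments such as params (encoded into the query string) and headers. A minimal sketch, with httpbin.org used purely as an illustrative echo endpoint:

import requests

payload = {'wd': 'python', 'page': 1}
headers = {'User-Agent': 'Mozilla/5.0'}

r = requests.get('http://httpbin.org/get', params=payload, headers=headers, timeout=30)
print(r.url)     # http://httpbin.org/get?wd=python&page=1, the params are appended as a query string
print(r.json())  # httpbin echoes the request back as JSON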

Basic requests framework

# General-purpose code framework for fetching a page
import requests

def getHtmlText(url):
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # raise an HTTPError if the status code is not 200
        response.encoding = response.apparent_encoding
        return response.text
    except:
        return 'Exceptional error occurred'

if __name__ == '__main__':
    url = "http://file.dl01.zxxk.com//OutFile/20190122/11265993164061984.doc?mkey=5e997e6187fcdc82217253aed14a5676705"
    print(getHtmlText(url))

Basic framework for downloading an image

import requests, os

url = "http://image.ngchina.com.cn/2019/0122/20190122124507342.jpg"
root = "D://pics//"
path = root + url.split("/")[-1]   # keep the original file name from the URL
try:
    if not os.path.exists(root):   # create the target folder if it does not exist
        os.mkdir(root)
    if not os.path.exists(path):   # only download if the file is not already there
        r = requests.get(url)
        with open(path, 'wb') as f:
            f.write(r.content)     # write the binary response body to disk
        print('download succeeded')
    else:
        print("file already exists")
except:
    print("download failed")


Crawling the Chinese university ranking site with requests and bs4:


# pip install beautifulsoup4
# from bs4 import BeautifulSoup  # import the BeautifulSoup class from the bs4 package
# soup = BeautifulSoup("data", "html.parser")  # bs4's HTML parser; lxml also provides HTML and XML parsers (pip install lxml)
# print(soup.prettify())
# <>.find_all(name, attrs, recursive, string)
# name matches on tag names, attrs matches on tag attribute values, recursive controls whether all descendants are searched (default True), string matches the text between <>...</>
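
A minimal sketch of those four find_all parameters follows; the toy HTML string is made up purely for illustration:

from bs4 import BeautifulSoup

demo = '<div><p class="title"><b>Demo</b></p><p class="story">links: <a href="http://example.com/a" id="l1">one</a></p></div>'
soup = BeautifulSoup(demo, 'html.parser')

print(soup.find_all('a'))                            # name: every <a> tag
print(soup.find_all('p', attrs={'class': 'story'}))  # attrs: only <p class="story">
print(soup.div.find_all('p', recursive=False))       # recursive=False: direct children of <div> only
print(soup.find_all(string='one'))                   # string: match text content instead of tags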

import requests,bs4
from bs4 import BeautifulSoup

def getHtmlText(url):  # fetch the page content
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except:
        return 'Exceptional error occurred'

def fillUnivList(html, uinfo):  # extract the ranking info from the page
    soup = BeautifulSoup(html, 'html.parser')
    for tr in soup.find('tbody').children:  # locate the tbody tag and iterate over its children
        if isinstance(tr, bs4.element.Tag):  # skip NavigableString nodes; keep only Tag elements
            tds = tr('td')  # tr('td') is shorthand for tr.find_all('td')
            uinfo.append([tds[0].string, tds[1].string, tds[2].string, tds[3].string])

def printUnVlist(uinfo, num):  # print the results
    tplt = "{}\t{}\t{}\t{}"
    print(tplt.format("Rank", "University", "Location", "Score"))
    for i in range(num):
        u = uinfo[i]  # index into uinfo, not an empty local list
        print(tplt.format(u[0], u[1], u[2], u[3]))



def main():
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html'
    uinfo = []
    html = getHtmlText(url)
    fillUnivList(html,uinfo)
    printUnVlist(uinfo, 3)  # print the top N entries


if __name__ == '__main__':
    main()

Crawling Taobao product listings with requests to compare prices


# Regular expressions describe string patterns
# regex = re.compile(pattern, flags=0): pattern is the expression as a string (ideally a raw string); flags modifies matching, e.g. re.I / re.S (see the re sketch after this listing)
# Import style 1: import module_name
# Import style 2: from module_name import func1, func2
import requests,time
import random,re,json


def getHTMLText(url):
    cookies = dict(cookies_are='cooks')  # placeholder cookie value
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'}
    proxies = ["115.218.222.64:9000", "61.164.39.67:53281"]  # a list rather than a dict, because dict keys are unique and only one 'http'/'https' entry could be kept
    try:
        response = requests.get(url, timeout=50, cookies=cookies, proxies={'http': random.choice(proxies)}, headers=headers)
        response.raise_for_status()  # raise an HTTPError if the status code is not 200
        response.encoding = response.apparent_encoding
        print(response.url)
        return response.text
    except:
        return "Exceptional error occurred"

def parsePage(infoList, html):
    try:
        price = re.findall(r'\"view_price\":\"[\d\.]*\"', html)   # price
        title = re.findall(r'\"raw_title\"\:\".*?\"', html)       # title
        address = re.findall(r'\"item_loc\"\:\".*?\"', html)      # location
        payments = re.findall(r'\"view_sales\"\:\".*?\"', html)   # number of payments
        ShopName = re.findall(r'\"nick\"\:\".*?\"', html)         # shop name
        PIC_img = re.findall(r'\"pic_url\"\:\".*?\"', html)       # image URL
        for i in range(len(price)):
            INfoDICT = {}
            # split on ':' and eval() the right-hand side to strip the surrounding quotes
            INfoDICT['view_price'] = eval(price[i].split(":")[1])
            INfoDICT['raw_title'] = eval(title[i].split(":")[1])
            INfoDICT['item_loc'] = eval(address[i].split(":")[1])
            INfoDICT['view_sales'] = eval(payments[i].split(":")[1])
            INfoDICT['nick'] = eval(ShopName[i].split(":")[1])
            INfoDICT['pic_url'] = eval(PIC_img[i].split(":")[1])
            infoList.append(INfoDICT)

            # print(INfoDICT)

    except:
        print("except Error")


def PrintGoodList(infoList):
    tplt = "{:4}\t{:16}\t{:8}\t{:8}\t{:8}\t{:16}\t{:30}"
    print(tplt.format("No.", "title", "price", "location", "payments", "shop", "image URL"))
    count = 0
    for g in infoList:  # each g is a dict built by parsePage, so index by key rather than position
        count = count + 1
        print(tplt.format(count, g['raw_title'], g['view_price'], g['item_loc'], g['view_sales'], g['nick'], g['pic_url']))



def Getmain(keyword):
    # keyword = input("Enter a search keyword: ")
    depth = 2  # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + keyword
    infoList = []
    for i in range(depth):
        try:
            url = start_url + "&s=" + str(44 * i)  # Taobao shows 44 items per page; s is the offset
            # time.sleep(1)  # optional pause between pages
            html = getHTMLText(url)
            parsePage(infoList, html)
        except:
            continue
    print(json.dumps(infoList, ensure_ascii=False))
    # PrintGoodList(infoList)
    # INfoDICT['data'] = infoList
    
if __name__ == '__main__':
    Getmain(input("Enter a search keyword: "))
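
The re.compile note at the top of this listing can be sketched on its own; the sample string below is made up purely for illustration:

import re

sample = '"view_price":"59.00","raw_title":"Canvas Backpack","view_price":"129.00"'

# compile once and reuse the pattern object; re.S makes '.' also match newlines
price_re = re.compile(r'\"view_price\":\"[\d\.]*\"', re.S)
print(price_re.findall(sample))  # ['"view_price":"59.00"', '"view_price":"129.00"']

# eval() on the right-hand side strips the surrounding quotes, as parsePage does above
print([eval(m.split(":")[1]) for m in price_re.findall(sample)])  # ['59.00', '129.00']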
