Python爬虫

opener

  1. 新建opener
    因为常规的urlopen()不支持代理,cookie和https等高级功能

    from urllib import request, parse
    import ssl
    import random

    # Skip certificate verification so self-signed HTTPS sites still load
    ssl._create_default_https_context = ssl._create_unverified_context
    # Build an HTTPSHandler object to process https requests
    # (// is not a Python comment marker; use #)
    http_handler = request.HTTPSHandler()
    # Build an opener object — plain urlopen() cannot handle proxies,
    # cookies, https handlers and other advanced features
    opener = request.build_opener(http_handler)
    url = 'https://www.liepin.com/?'  # fixed scheme typo ('https//') and added '?' before the query string
    params = {"key": "python"}        # renamed: do not shadow the builtin `str`
    url = url + parse.urlencode(params)
    # Build the Request object (renamed so the urllib `request` module is not shadowed)
    req = request.Request(url)
    # Send the request through the custom opener
    f = opener.open(req)
    # Read and decode the response body (print it instead of discarding it)
    print(f.read().decode())
  2. 给request添加请求头

    ua_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
    ]
    # Randomly pick one User-Agent so requests look less like a bot
    # (// is not a Python comment marker; use #)
    user_agent = random.choice(ua_list)
    # NOTE(review): `request` here refers to the Request object built in the
    # previous snippet, which shadows the urllib.request module
    request.add_header('user-agent', user_agent)
  3. ProxyHandler处理器,设置代理ip

    from urllib import request, parse
    import urllib
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context

    # Build a handler that routes traffic through a proxy IP
    # (// is not a Python comment marker; use #)
    proxy_handler = request.ProxyHandler({"http": "110.73.1.105:8123"})
    opener = request.build_opener(proxy_handler)
    # NOTE(review): `url` must be defined before this line.
    # Renamed the Request variable so the urllib `request` module is not shadowed.
    req = request.Request(url)
    f = opener.open(req)
    print(f.read())  # print instead of discarding the response body

Requests

继承了urllib的所有特性,比urllib更简单

  1. get请求

    import requests
    import random
    url = 'http://search.jiayuan.com/v2/index.php?key=%E7%90%86%E8%B4%A2&'
    qs = {
        "sex": "f"
    }
    # Pick one User-Agent from the list (ua_list must be defined as above;
    # // is not a Python comment marker, use #)
    user_agent = random.choice(ua_list)
    headers = {"user-agent": user_agent}

    # params accepts a dict or string of query parameters; a dict is
    # url-encoded automatically, so no urlencode() call is needed
    with requests.get(url=url, params=qs, headers=headers) as response:
        # response.text holds the decoded response body
        data = response.text
        # show the encoding detected from the response headers
        print(response.encoding)
        with open('tb.html', 'w', encoding='utf-8') as fp:
            fp.write(data)  # `with` closes the file automatically; fp.close() was redundant
  2. post请求

    with requests.post(url, data=data) as response:
        # A JSON response body can be parsed directly with response.json()
        # (it is a method, not an attribute — the original missed the parentheses)
        print(response.json())
  3. 如果要使用代理,加入proxies

    requests.get(url, proxies=proxies)  # fixed typo: peoxies -> proxies; a url argument is required

页面解析

  1. 正则表达式

    import re
    with open('tb.html', 'r', encoding='utf-8') as fp:
        data = fp.read()
        # Do not rebind the name `re` to the pattern string: the original
        # `re = r'...'` shadowed the re module, so re.compile(re) raised
        # AttributeError ('str' object has no attribute 'compile')
        pattern = r'"text": "(.*)",'
        reg = re.compile(pattern)
        res = reg.findall(data)
        print(res)
  2. xml
    xml格式示例文档

    
    <bookstore>
      <book category="cooking">
        <title lang="en">Everyday Italian</title>
        <author>Giada De Laurentiis</author>
        <year>2005</year>
        <price>30.00</price>
      </book>
      <book category="children">
        <title lang="en">Harry Potter</title>
        <author>J K. Rowling</author>
        <year>2005</year>
        <price>29.99</price>
      </book>
      <book category="web">
        <title lang="en">XQuery Kick Start</title>
        <author>James McGovern</author>
        <author>Per Bothner</author>
        <author>Kurt Cagle</author>
        <author>James Linn</author>
        <author>Vaidyanathan Nagarajan</author>
        <year>2003</year>
        <price>49.99</price>
      </book>
      <book category="web" cover="paperback">
        <title lang="en">Learning XML</title>
        <author>Erik T. Ray</author>
        <year>2003</year>
        <price>39.95</price>
      </book>
    </bookstore>
  3. 选择器

    表达式 描述
    nodename 选取此节点的所有子节点
    / 从根节点开始选
    // 在全文档中找此节点
    . 当前节点
    .. 当前节点的父节点
    @ 选取属性

范例

```
bookstore   选取 bookstore 元素的所有子节点(如果只有一个的话)。
/bookstore  选取根元素 bookstore。注释:假如路径起始于正斜杠( / ),则此路径始终代表到某元素的绝对路径!
bookstore/book  选取属于 bookstore 的直接子元素的所有 book 元素。
//book  选取所有 book 子元素,而不管它们在文档中的位置。
bookstore//book 选择属于 bookstore 元素的后代的所有 book 元素,而不管它们位于 bookstore 之下的什么位置。
bookstore//book/@lang   选取book元素的lang属性值。
bookstore//book[@class="book-css"]/title    选取class属性值为“book-css”的book元素的title。
//*[@class="bold"] 获取 class 值为 bold 的标签名
```

4. 读取xml文件

```
from lxml import etree

# Parse the XML document, then collect node lists with XPath expressions
doc = etree.parse('data.xml')
titles = doc.xpath('//book/title')
authors = doc.xpath('//book/author')
prices = doc.xpath('//book/price')
# Print the inner text of every <title> element
for title_node in titles:
    print(title_node.text)

5. lxml读取html文件

```
from lxml import etree
# Pass an explicit HTMLParser so the file is parsed as HTML with utf-8 encoding
# (// is not a Python comment marker; use #)
parser = etree.HTMLParser(encoding='utf-8')
htmlelement = etree.parse('liepin.html', parser=parser)
html_string = etree.tostring(htmlelement, encoding='utf-8').decode(encoding='utf-8')  # fixed 'utg-8' typo
# Read the inner text of each matched <a> element
links = htmlelement.xpath('//div/div/span/a')
for a in links:
    print(a.text)

# Read attribute values
with open('liepin.html', 'r', encoding='utf-8') as fp:
    content = fp.read()
    html = etree.HTML(content)
    titles = html.xpath('//div/div/span/@title')  # renamed: the loop below iterated an undefined `titles`
    for title in titles:
        print(title)
```

你可能感兴趣的:(笔记)