Web scraping === urllib2

1. Getting domain registration (WHOIS) information

  #-*- coding:utf-8 -*-
  import whois
  print(whois.whois('baidu.com'))

The urllib2 library

#-*- coding:utf-8 -*-
import urllib2

# Download the content of the given URL
def download(url, user_agent='Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36', num_retries=3, proxy=None):
    print('downloading!!!')
    content = None
    # Forge the request headers so the crawler looks like a browser
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        # Enable a proxy IP if one was supplied ({'host': ..., 'port': ...})
        if proxy:
            handler = urllib2.ProxyHandler(proxies={'http': 'http://%(host)s:%(port)d' % proxy})
            opener = urllib2.build_opener(handler)
            urllib2.install_opener(opener)
        # Read the content at the given URL
        content = urllib2.urlopen(request).read()
    except Exception as e:
        print e
        # Only retry while retries remain
        if num_retries > 0:
            # Retry only on 5xx (server) errors, recursing with one fewer retry
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1, proxy)
    return content
#*****************Test**********************
# print download('https://www.baidu.com/')
# print download('https://www.taobao.com/')
# print download('https://www.vip.com/')
print download('http://www.xicidaili.com/')

Request headers serve several purposes; here the main one is to disguise the crawler as a browser.
Why proxy IPs matter: an HTTP proxy IP is important because if the current IP address gets blocked or rate-limited, the crawler can switch to a new address and keep running.
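For example, assuming a proxy listening at 127.0.0.1:8080 (a placeholder address, not a real proxy), the download() function above could be routed through it like this:

  # The proxy address below is a placeholder; substitute a working HTTP proxy
  print download('http://example.webscraping.com/', proxy={'host': '127.0.0.1', 'port': 8080})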

The robots protocol (robots.txt)

To view a site's robots protocol: domain + robots.txt ===> https://www.taobao.com/robots.txt

  #-*- coding:utf-8 -*-
  import robotparser
  # Try the robots.txt of baidu or taobao
  rp = robotparser.RobotFileParser()
  rp.set_url('https://www.taobao.com/robots.txt')
  rp.read()
  # Check whether the given user agent is allowed to fetch the URL
  print rp.can_fetch(useragent='Baiduspider', url='https://www.taobao.com/article')

Sitemaps
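A sitemap is a single XML file listing a site's pages, so a crawler can discover URLs without following links. Below is a minimal sketch of sitemap-based crawling; the sitemap URL is only an assumed example, and the snippet reuses the download() function defined above.

  #-*- coding:utf-8 -*-
  import re

  # Assumed sitemap URL for illustration; real sites expose their own sitemap.xml
  sitemap = download('http://example.webscraping.com/sitemap.xml')
  if sitemap:
      # Pull every <loc>...</loc> entry out of the sitemap and download each page
      links = re.findall('<loc>(.*?)</loc>', sitemap)
      for link in links:
          html = download(link)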

Using requests

  import requests
  import re
  # Fetch the page content
  text = requests.get('http://example.webscraping.com/places/default/view/Afghanistan-1').text
  # Match the data cells (class "w2p_fw") with a regular expression
  result = re.findall('<td class="w2p_fw">(.*?)</td>', text)
  # Print one of the matched cells
  print result[3]

Extraction methods:

BeautifulSoup

from bs4 import BeautifulSoup
import requests

text = requests.get('http://example.webscraping.com/places/default/view/Afghanistan-1').text
bs = BeautifulSoup(text, 'html.parser')
bs.prettify()

# tr = bs.find('tr', attrs={'id': 'places_phone__row'})
# Extract the text of the matching cell
# print tr.find('td', attrs={'class': 'w2p_fw'}).text

# Extract the matching image
tr = bs.find('tr', attrs={'id': 'places_national_flag__row'})
# print tr.find('td', attrs={'class': 'w2p_fw'})
s = tr.find('img').attrs['src']
print s

Extraction with lxml (CSS selectors)

  import requests
  import lxml.html

  text = requests.get('http://example.webscraping.com/places/default/view/Afghanistan-1').text
  doc = lxml.html.fromstring(text)
  print type(text)        # <type 'unicode'>
  text = lxml.html.tostring(doc, pretty_print=True)
  print type(doc)         # <class 'lxml.html.HtmlElement'>
  tree = lxml.html.fromstring(text)
  # Select the country row's data cell with a CSS selector
  t = tree.cssselect('tr#places_country__row > td.w2p_fw')[0].text_content()
  # t = doc.cssselect('tr#places_country__row > td.w2p_fw')[0].text_content()
  print t, tree.get('class')

lxml -- xpath
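The XPath example below parses a local file named hello.html whose contents are not shown in the post. The following snippet writes an assumed stand-in file (markup invented for illustration, in the style of the usual lxml tutorial page with li items and link1.html-style anchors) so the example can actually run:

  #-*- coding:utf-8 -*-
  # Write an assumed hello.html so the XPath example below has something to parse
  sample = '''<div><ul>
      <li class="item-0"><a href="link1.html">first item</a></li>
      <li class="item-1"><a href="link2.html">second item</a></li>
      <li class="item-inactive"><a href="link3.html">third item</a></li>
  </ul></div>'''
  with open('hello.html', 'w') as f:
      f.write(sample)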

  #-*- coding:utf-8 -*-
  from lxml import etree

  with open('hello.html','r') as f:
      html = f.read()


  html = etree.HTML(html)
  html = etree.tostring(html,pretty_print=True)
  # print html
  html = etree.HTML(html)
  # html = etree.parse('hello.html')
  # print type(html)
  # Get all <li> tags
  result = html.xpath('//li')
  # print result
  # print type(result)
  # print type(result[0])
  # print len(result)

  # Get the class attribute of every <li> tag
  # print html.xpath('//li/@class')

  # Get the <a> tags under <li> whose href is "link1.html"
  print html.xpath('//li//a[@href="link1.html"]')
  # Get the href attribute of every <a> under <li>
  print html.xpath('//li//a/@href')
  # print html.xpath('//li//a/text()')