Python爬虫学习(一)

其实node.js也可以做爬虫，相信前端的同学已经有所了解，但今天我们要讲的是如何用python，python实现起来其实更简单。

import urllib.request

url = "http://www.baidu.com"
# Open the page and read the raw body; the with-block closes the connection.
with urllib.request.urlopen(url) as page:
    response = page.read()
# read() returns bytes, so decode the bytes we just read into text.
data = response.decode('UTF-8')
print(data)   # data holds the page's HTML markup as a string

urllib是python里面一个处理urls的库，可以看教程,这里简单介绍一下。

import urllib.request
with urllib.request.urlopen('http://www.python.org/') as page:
    # Pull only the first 300 bytes of the response body and show them.
    leading_bytes = page.read(300)
    print(leading_bytes)

This example gets the python.org main page and displays the first 300 bytes of it.
上面的例子展示了读取该网页的前300个字节。
除了可以向urlopen方法里面传递网页地址，还可以构造一个request对象。

import urllib.request
DATA = b'some data'
# Build a Request object that bundles the target URL, the request body,
# and the HTTP method to use.
req = urllib.request.Request(url='http://localhost:8080', data=DATA, method='PUT')
with urllib.request.urlopen(req) as f:
    pass

我们来看看下面这段打印提示

>>> a = urllib.request.urlopen(full_url)
>>> type(a)
<class 'http.client.HTTPResponse'>

>>> a.geturl()
'http://www.baidu.com/s?word=Jecvay'

>>> a.info()
<http.client.HTTPMessage object at 0x...>

>>> a.getcode()
200

以上可以看出构造出的a都包含哪些内容。

如果要抓取百度上面搜索关键词kobe bryant的网页, 则代码如下

import urllib
import urllib.parse
import urllib.request

# Query parameters for the search; urlencode() turns them into "word=kobe+bryant".
query = {'word': 'kobe bryant'}

url_values = urllib.parse.urlencode(query)
url = "http://www.baidu.com/s?"
full_url = url + url_values

# Fetch the result page; the with-block closes the connection afterwards.
with urllib.request.urlopen(full_url) as page:
    raw = page.read()
# The body arrives as bytes; decode it to get the HTML text.
data = raw.decode('UTF-8')
print(data)

我们看下urllib.parse这个模块的基本用法

urlencode函数，可以把key-value这样的键值对转换成我们想要的格式，返回的是a=1&b=2这样的字符串，比如：
from urllib.parse import urlencode

# Key-value pairs to serialise into a query string.
data = {
    'a': 'test',
    'name': '魔兽',
}
# By default urlencode() percent-encodes non-ASCII text as UTF-8.
print(urlencode(data))
# Output: a=test&name=%E9%AD%94%E5%85%BD
# Passing encoding='gbk' instead gives: a=test&name=%C4%A7%CA%DE

>>> from urllib.parse import urlparse
>>> o = urlparse('http://www.cwi.nl:80/%7Eguido/Python.html')
>>> o
ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html',
            params='', query='', fragment='')
>>> o.scheme
'http'
>>> o.port
80
>>> o.geturl()
'http://www.cwi.nl:80/%7Eguido/Python.html'

在写代码之前我们需要先了解一下python中队列的知识。

from collections import deque
# Seed the queue with three names, head of the line first.
queue = deque(("Eric", "John", "Michael"))
for latecomer in ("Terry", "Graham"):
    queue.append(latecomer)     # each newcomer joins at the tail
queue.popleft()                 # head leaves the queue -> 'Eric'
queue.popleft()                 # next head leaves -> 'John'
queue                           # what remains: deque(['Michael', 'Terry', 'Graham'])
用 list 来实现队列其实效率很低, 因为 list 在队首执行 pop(0) 和 insert(0, x) 时
都需要移动后面的全部元素, Python 官方建议使用 collections.deque 来高效地完成队列任务

好了介绍完基本知识，就开始写代码了，毕竟我是一个coder...

import http.client
import re
import urllib
import urllib.request

from collections import deque

# Breadth-first crawl: pop a URL from the queue, fetch it, and enqueue every
# absolute link found on the page that has not been seen before.
queue = deque()
visited = set()

url = 'http://news.dbanotes.net'  # entry page; can be replaced with another site

queue.append(url)
# Mark URLs as seen when they are enqueued, so a link that appears on many
# pages is queued (and fetched) only once.
visited.add(url)
cnt = 0

# Pattern for the value of every href="..." attribute; compiled once, outside the loop.
linkre = re.compile('href="(.+?)"')

while queue:
    url = queue.popleft()  # take the URL at the head of the queue

    print('已经抓取: ' + str(cnt) + '   正在抓取 <---  ' + url)
    cnt += 1

    # A single unreachable or malformed page must not abort the whole crawl,
    # so the request itself is guarded too, and is bounded by a timeout.
    try:
        with urllib.request.urlopen(url, timeout=10) as urlop:
            # The header may be absent; treat that the same as "not HTML".
            content_type = urlop.getheader('Content-Type') or ''
            if 'html' not in content_type:
                continue
            data = urlop.read().decode('utf-8')
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError/timeouts; ValueError covers
        # malformed URLs and pages that are not valid UTF-8.
        continue

    # Extract every link on the page and enqueue the ones not seen yet.
    for x in linkre.findall(data):
        # Only absolute http(s) links can be passed to urlopen() directly.
        if x.startswith(('http://', 'https://')) and x not in visited:
            visited.add(x)
            queue.append(x)
            print('加入队列 --->  ' + x)

上面的例子，是先爬取入口页面（这里是 news.dbanotes.net），用正则提取页面中所有 href 属性里的链接，然后放入队列，继续爬取这些地址的内容，直到队列里面没有元素为止。
显然这还不是我们想要的结果，咱们继续来看

// content是爬取到的内容
content = response.read().decode('utf-8')
pattern = re.compile('.*?.*?(.*?).*?(.*?)

import urllib import urllib2 import re page = 1 url = 'http://www.qiushibaike.com/hot/page/' + str(page) user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers = { 'User-Agent' : user_agent } try: request = urllib2.Request(url,headers = headers) response = urllib2.urlopen(request) content = response.read().decode('utf-8') pattern = re.compile('.*?.*?(.*?).*?(.*?)

__author__ = 'CQC' # -*- coding:utf-8 -*- import urllib import urllib2 import re import thread import time #糗事百科爬虫类 class QSBK: #初始化方法，定义一些变量 def __init__(self): self.pageIndex = 1 self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' #初始化headers self.headers = { 'User-Agent' : self.user_agent } #存放段子的变量，每一个元素是每一页的段子们 self.stories = [] #存放程序是否继续运行的变量 self.enable = False #传入某一页的索引获得页面代码 def getPage(self,pageIndex): try: url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex) #构建请求的request request = urllib2.Request(url,headers = self.headers) #利用urlopen获取页面代码 response = urllib2.urlopen(request) #将页面转化为UTF-8编码 pageCode = response.read().decode('utf-8') return pageCode except urllib2.URLError, e: if hasattr(e,"reason"): print u"连接糗事百科失败,错误原因",e.reason return None #传入某一页代码，返回本页不带图片的段子列表 def getPageItems(self,pageIndex): pageCode = self.getPage(pageIndex) if not pageCode: print "页面加载失败...." return None pattern = re.compile('.*?(.*?).*?(.*?).*?

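假设有这样一段html代码

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc)

soup.title
# <title>The Dormouse's story</title>

soup.title.name
# u'title'    相当于是获得标签名

soup.title.string
# u'The Dormouse's story'    获取的是标签里的文字内容

soup.title.parent.name
# u'head'    熟悉jquery的同学一看就懂

soup.p
# <p class="title"><b>The Dormouse's story</b></p>    这种写法其实获得的是第一个p元素

soup.p['class']
# u'title'    获取第一个p的class属性，如果有多个class，返回list

soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
# 这里是查找html标签里面所有的a元素，返回的是一个数组

soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>    好吧，这不是jquery里面的属性选择器吗

soup.find(id="link3").get('href')
# http://example.com/tillie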

html_doc = """ The Dormouse's story The Dormouse's story The Dormouse's story Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well. ... """ from bs4 import BeautifulSoup soup = BeautifulSoup(html_doc) head_tag = soup.head head_tag # The Dormouse's story head_tag.contents [The Dormouse's story,The Dormouse's story ] 获取某个元素的子元素，取某个可以以下标的方式 title_tag = soup.title title_tag.parent # The Dormouse's story The Dormouse's story link = soup.a link # Elsie for parent in link.parents: if parent is None: print(parent) else: print(parent.name) # p # body # html # [document] # None 通过元素的 .parents 属性可以递归得到元素的所有父辈节点,下面的例子使用了 .parents 方法遍历了标签到根节点的所有节点. 讲到兄弟节点可能和jquery不大一样，分的特别细。 sibling_soup = BeautifulSoup("text1text2") sibling_soup.b.next_sibling # text2 sibling_soup.c.previous_sibling # text1 当然我们也可通过 .next_siblings 和 .previous_siblings 属性可以对当前节点的兄弟节点迭代输出: for sibling in soup.a.next_siblings: print(repr(sibling)) # u',\n' # Lacie # u' and\n' # Tillie # u'; and they lived at the bottom of a well.' # None 需要注意的是兄弟节点也包括换行符和字符串等。当然也可以使用正则来查找： import re for tag in soup.find_all(re.compile("^b")): print(tag.name) # body # b soup.find_all("p", "title") # [The Dormouse's story]

Python爬虫学习(一)

你可能感兴趣的:(Python爬虫学习(一))