本文是练手Demo,主要是使用 Beautiful Soup
来爬取网页数据。
Beautiful Soup 提供一些简单的、Python 式的函数,用来处理导航、搜索、修改分析树等功能。
sudo easy_install pip
sudo pip install beautifulsoup4
本示例是抓取一个靠谱的真诚透明的互联网金融公司的投资列表页面【点我访问网页】,页面如下图:
本示例是获取项目列表,打开Chrome的调试栏,找到对应的位置,如下图:
import sys
import json
import urllib2 as HttpUtils
import urllib as UrlUtils
from bs4 import BeautifulSoup
def gethtml(page):
    """Fetch the project-list page for the given page number.

    Sends a GET request to the hard-coded list URL with ``page`` as a query
    parameter; ``category``, ``rate`` and ``range`` are sent as empty values.

    Side effect: every call installs a process-wide opener whose HTTP and
    HTTPS handlers have debug logging enabled (``debuglevel=1``).

    Args:
        page: Page number; it is url-encoded into the query string.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the request
        fails with ``URLError`` (its code and/or reason are printed).
    """
    url = 'https://box.jimu.com/Project/List'
    values = {
        'category': '',
        'rate': '',
        'range': '',
        'page': page,
    }
    data = UrlUtils.urlencode(values)

    # Turn on wire-level debug output for both HTTP and HTTPS requests.
    httphandler = HttpUtils.HTTPHandler(debuglevel=1)
    httpshandler = HttpUtils.HTTPSHandler(debuglevel=1)
    opener = HttpUtils.build_opener(httphandler, httpshandler)
    HttpUtils.install_opener(opener)

    request = HttpUtils.Request(url + '?' + data)
    request.get_method = lambda: 'GET'
    try:
        response = HttpUtils.urlopen(request, timeout=10)
    except HttpUtils.URLError as err:
        # The error may carry a code, a reason, or both, so probe each one
        # before printing it.
        if hasattr(err, 'code'):
            print(err.code)
        if hasattr(err, 'reason'):
            print(err.reason)
        return None

    print('====== Http request OK ======')
    try:
        return response.read().decode('utf-8')
    finally:
        # Release the underlying connection even if read/decode fails.
        response.close()
# Parse the listing HTML into ``projectList``, a list of per-project dicts.
# NOTE(review): ``html`` is not defined in this part of the file; presumably
# it is the string returned by gethtml() -- confirm against the caller.
soup = BeautifulSoup(html, 'html.parser')

# Each direct child tag of the first element with class "row" is treated as
# one project. find_all(True, recursive=False) returns only those child tags,
# so whitespace/text/comment nodes between them are skipped instead of
# crashing the attribute lookups below.
container = soup.find(attrs={'class': 'row'})
if container is not None:
    items = container.find_all(True, recursive=False)
else:
    # No listing container on the page: produce an empty project list.
    items = []

projectList = []
for item in items:
    # Extract the fields of interest.
    title = item.find(attrs={'class': 'title'}).string.strip()
    projectId = item.find(attrs={'class': 'subtitle'}).string.strip()
    projectType = item.find(attrs={'class': 'invest-item-subtitle'}).span.string

    state = 'Open'
    percentTag = item.find(attrs={'class': 'percent'})
    if percentTag is None:
        # No progress element: funding has completed, so the invested
        # amount equals the total amount.
        percent = '100%'
        state = 'Finished'
        totalAmount = item.find(attrs={'class': 'project-info'}).span.string.strip()
        investedAmount = totalAmount
    else:
        percent = percentTag.string.strip()
        decimalList = item.find(attrs={'class': 'decimal-wrap'}).find_all(attrs={'class': 'decimal'})
        totalAmount = decimalList[0].string
        investedAmount = decimalList[1].string

    # An explicit state label on the item overrides the derived state.
    # NOTE(review): the original indentation was lost; this override is
    # assumed to apply to both branches above -- confirm.
    investState = item.find(attrs={'class': 'invest-item-type'})
    if investState is not None:
        state = investState.string

    # The profit is split between the span's leading text node and an <em>.
    profitSpan = item.find(attrs={'class': 'invest-item-rate'}).find(attrs={'class': 'invest-item-profit'})
    profit1 = profitSpan.next_element.strip()
    profit2 = profitSpan.em.string.strip()
    profit = profit1 + profit2

    term = item.find(attrs={'class': 'invest-item-maturity'}).find(attrs={'class': 'invest-item-profit'}).string.strip()

    project = {
        'title': title,
        'projectId': projectId,
        'type': projectType,
        'percent': percent,
        'totalAmount': totalAmount,
        'investedAmount': investedAmount,
        'profit': profit,
        'term': term,
        'state': state,
    }
    projectList.append(project)