centos 7下python2.7实现网页爬虫
使用requests、BeautifulSoup
1.安装
方法一:
下载:http://www.crummy.com/software/BeautifulSoup/bs4/download/4.2/
解压:tar -xzvf beautifulsoup4-4.2.0.tar.gz
安装:进入解压后的目录
python setup.py build
sudo python setup.py install
方法二(快速安装)
(Ubuntu) sudo apt-get install python-bs4
或者
pip install beautifulsoup4
或者
easy_install beautifulsoup4
2.使用
import requests
from bs4 import BeautifulSoup
3.实例
import requests
from bs4 import BeautifulSoup
class Grab(object):
    """Scrape news list pages and print the fields of every news item."""

    @staticmethod
    def _native_text(node):
        """Return the stripped text of ``node.string`` as a native ``str``.

        Returns an empty string when ``node`` is ``None`` or when its
        ``.string`` is ``None`` (bs4 does this for tags that do not hold
        exactly one string).
        """
        text = node.string if node is not None else None
        if text is None:
            return ''
        if not isinstance(text, str):
            # Python 2: bs4 hands back unicode. Encode it so it can be joined
            # with the byte-string labels and printed without an implicit
            # ASCII encode. On Python 3 the text already is ``str``.
            text = text.encode('utf-8')
        return text.strip()

    def tzuqiu_parse_url_to_html(self, url, page, timeout=10):
        """Download one list page and print img/title/content/time/tag per item.

        :param url: address of the news list endpoint.
        :param page: page index, sent as the ``page`` query parameter.
        :param timeout: seconds to wait for the server before giving up.
        :raises requests.RequestException: on a network failure, a timeout,
            or an HTTP error status.
        """
        # ``params`` lets requests build the query string, so ``page`` may be
        # an int or a str; ``timeout`` keeps a dead server from hanging the run.
        response = requests.get(url, params={'page': page}, timeout=timeout)
        # Fail loudly instead of silently parsing an error page.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")
        for item in soup.find_all(class_='new-item'):
            # Look the image up by tag name rather than by child position,
            # which depends on the whitespace nodes in the markup.
            img_box = item.find(class_='new-img')
            img_tag = img_box.find('img') if img_box is not None else None
            img = img_tag.get('src') if img_tag is not None else None

            tags = item.find(class_='new-tag limittext')
            spans = tags.find_all('span') if tags is not None else []
            # Pad so a missing time/tag span prints as empty text instead of
            # raising IndexError and aborting the whole page.
            spans = list(spans) + [None, None]

            print('img:' + str(img))
            print('title:' + self._native_text(item.find(class_='new-title')))
            print('content:' + self._native_text(
                item.find(class_='limittext-multirow')))
            print('time:' + self._native_text(spans[0]))
            print('tag:' + self._native_text(spans[1]))

    def tzuqiu_next_page(self, url, page):
        """Fetch and print the list page with index ``page``."""
        self.tzuqiu_parse_url_to_html(url, page)
# Script entry point: only runs when the file is executed directly.
if __name__ == '__main__':
    grab = Grab()
    # Fetch and print the first two list pages (page=0 and page=1).
    for i in range(2):
        grab.tzuqiu_next_page('http://www.tzuqiu.cc/news/list.do', str(i))
4.执行
python grab.py