Packages used: lxml, requests, urllib2
Start URL: People's Daily (人民日报) homepage
Crawl target: People's Daily top news (要闻)
Output format: HTML table file
Approach: first collect all the article links from the target page, then crawl them one by one
#-*-coding:utf8-*-
# This code is rough in places and has plenty of room for improvement -- please go easy on me ^-^
import requests
import urllib2
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import HTMLParser
def htmls(url):
    # fetch an article page with urllib2 and return an lxml selector (None on HTTP error)
    url = url.replace(" ", "")
    head = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    request = urllib2.Request(url, headers={'User-Agent': head})
    try:
        response2 = urllib2.urlopen(request)
        html = response2.read()
        #html = unicode(html, "gb2312").encode("utf-8").decode('utf-8')
        selector = etree.HTML(html)
        return selector
    except urllib2.HTTPError:
        return None
def firsthtml(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
    html = requests.get(url, headers=header)
    selector = etree.HTML(html.text)
    return selector
def urls(url):
    # collect the links of the top-news list from the homepage
    selector = firsthtml(url)
    content_field1 = selector.xpath('/html/body/section[5]/div[2]/ul/li/strong/a')
    content_field2 = selector.xpath('/html/body/section[5]/div[2]/ul/li/a')
    content = content_field1 + content_field2
    urlss = []
    for item in content:
        urlss.append(item.attrib['href'])
    return urlss
def spider(url):  # handle a single article URL
    print 'Processing link ' + str(num) + ": ", url
    selector = htmls(url)
    if selector is None:
        print 'Link not found -_-'
        return
    temp = {}
    try:
        # layout of a regular article page
        title_path = selector.xpath('/html/body/div[4]/h1')
        content_path = selector.xpath('//*[@id="rwb_zw"]/p')
        time_path = selector.xpath('/html/body/div[4]/div/div[1]')
        source_path = selector.xpath('/html/body/div[4]/div/div[1]/a')
        temp['time'] = time_path[0].text[0:19]
        temp['source'] = source_path[0].text
        temp['title'] = title_path[0].text
    except:
        # fall back to the layout of a photo-gallery page
        title_path = selector.xpath('/html/body/div[@class="pic_content clearfix"]/div[@class="title"]/h1')
        content_path = selector.xpath('/html/body/div[@class="content clear clearfix"]/p')
        source_path = selector.xpath('//*[@id="picG"]/div[2]/div[2]/a')
        time_path = selector.xpath('//*[@id="picG"]/div[2]/div[2]/text()[2]')
        try:
            temp['time'] = time_path[0][0:23]
            temp['source'] = source_path[0].text
            temp['title'] = title_path[0].text
        except:
            print 'Failed to crawl this link -_-'
            return
    scontent = ''
    for content in content_path:
        if content.text:  # skip <p> nodes with no direct text
            scontent = scontent + content.text
    temp['content'] = scontent
    temp['url'] = url
    all.append(temp)
    print "Crawled this link successfully ^.^"
def tohtml(datas):
    # dump the crawled articles into an HTML table file
    fout = open('content.html', 'w')
    fout.write("<html>")
    fout.write("<head><meta charset='utf-8'>")
    fout.write("<title>人民日报要闻</title>")
    fout.write("</head>")
    fout.write("<body>")
    fout.write("<table border='1'>")
    for data in datas:
        fout.write("<tr>")
        fout.write("<td>%s</td>" % data['url'])
        fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
        fout.write("<td>%s</td>" % data['time'].encode('utf-8'))
        fout.write("<td>%s</td>" % data['source'].encode('utf-8'))
        fout.write("<td>%s</td>" % data['content'].encode('utf-8'))
        fout.write("</tr>")
    fout.write("</table>")
    fout.write("</body>")
    fout.write("</html>")
    fout.close()
if __name__ == '__main__':
    num = 1
    all = []
    urlss = urls('http://www.people.com.cn/')
    # pool = ThreadPool(4)
    for x in urlss:
        spider(x)
        num = num + 1
    # results = pool.map(spider, urlss)
    tohtml(all)
    # pool.close()
    # pool.join()
# I meant to make this multithreaded, but I couldn't be bothered to write it. Feel free to try it yourself -- it isn't hard ^-^
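As a starting point for that exercise, here is a minimal sketch (my assumption of how it could look, not part of the original script) of an alternative main block that crawls the links concurrently with the ThreadPool already imported above. The progress counter num is simply left at 1, since updating it safely from several threads would need extra care; appending to the shared list all is safe enough under CPython's GIL.

# ---- threaded variant (sketch only, untested assumption) ----
if __name__ == '__main__':
    num = 1                                    # left at 1: a thread-safe counter is omitted here
    all = []                                   # shared result list; list.append is atomic in CPython
    urlss = urls('http://www.people.com.cn/')
    pool = ThreadPool(4)                       # 4 worker threads from multiprocessing.dummy
    pool.map(spider, urlss)                    # crawl all the links concurrently
    pool.close()
    pool.join()
    tohtml(all)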
I'm a tiny little divider line, and I'm still the first one ^-^