Only scrapes the front page: no duplicate-fetch check and no sleep between requests. Ha, if you want more pages just call call_me again (the formatting is a bit painful).
#!/usr/bin/python
# -*- coding: utf-8 -*-
from pyquery import PyQuery as pq
import urllib2
import cookielib
import writetemp
# Scrape one cnblogs list page and every post it links to.
def call_me(url):
    if url is None:
        print 'no url given'
        return
    try:
        # Share one opener so cookies carry across requests.
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        p = opener.open(url)
        if p.getcode() != 200:
            return
        html = p.read()
        blogs_info = []
        # Collect the post links from the list page.
        test = pq(html)
        blog_list = test('#post_list a.titlelnk')
        blog_list_url = [pq(m).attr('href') for m in blog_list]
        # Drop duplicate links while keeping their original order.
        blog_list_url = sorted(set(blog_list_url), key=blog_list_url.index)
        for m_url in blog_list_url:
            try:
                new_p = opener.open(m_url)
                if new_p.getcode() != 200:
                    continue
                m_html = new_p.read()
                db_blog = pq(m_html)
                info = {
                    'title': db_blog('#cb_post_title_url').text(),
                    'time': db_blog('#post-date').text(),
                    'info': db_blog('#cnblogs_post_body').html(),
                    'link': m_url,
                }
                print info
                blogs_info.append(info)
            except Exception:
                # Skip any post that fails to download or parse.
                continue
        writetemp.write_temp(blogs_info)
    except Exception, e:
        print e
    finally:
        print 'original page done'
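The script leans on a local writetemp module that isn't shown in the post. A minimal stand-in, assuming write_temp simply dumps the scraped posts to a JSON file (the filename here is made up):

# writetemp.py -- guessed stand-in for the module imported above
import json

def write_temp(blogs_info):
    # Dump the post dicts to JSON; non-ASCII text is escaped by default,
    # so a plain byte-mode file works fine under Python 2.
    with open('blogs_temp.json', 'w') as f:  # filename is an assumption
        json.dump(blogs_info, f, indent=2)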
#demo
#url = 'http://www.cnblogs.com/'
#call_me(url)
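As the note at the top admits, there is no sleep and no cross-call duplicate check. A rough sketch of how a caller could add both (the 3-second delay and the page-2 URL are assumptions, not from the original post):

import time

seen = set()
for page in ['http://www.cnblogs.com/', 'http://www.cnblogs.com/sitehome/p/2']:
    if page in seen:  # skip list pages we've already fetched
        continue
    seen.add(page)
    call_me(page)
    time.sleep(3)  # pause between pages instead of hammering the site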