http://blog.sina.com.cn/s/blog_6266e57b010128l4.html
序,引子
def updatePostsDB(request):
    """Fetch the newest posts of every configured blog and store them.

    For each configured site this calls
    ``getPostInfosFromWeb(feedurl, blog_type)`` and passes the result to
    ``recordToDB(PostSite, PostInfos)``.  Processing stops at the first
    site that raises; the error is logged instead of being propagated.

    Args:
        request: Accepted but not used by this function.

    Returns:
        None.  The status message is written to the log only.
    """
    import logging

    # One row per polled blog: (PostSite, feedurl, blog_type).
    SiteInfos = (
        ("L2ZStory", "feed://l2zstory.wordpress.com/feed/", "wordpress"),
        ("YukiLife", "feed://blog.sina.com.cn/rss/1583902832.xml", "sina"),
        ("ZLife", "feed://ireallife.wordpress.com/feed/", "wordpress"),
        ("ZLife_Sina", "feed://blog.sina.com.cn/rss/1650910587.xml", "sina"),
    )
    try:
        for PostSite, feedurl, blog_type in SiteInfos:
            PostInfos = getPostInfosFromWeb(feedurl, blog_type)
            recordToDB(PostSite, PostInfos)
        Msg = "Cron Job Done..."
        logging.info(Msg)
    except Exception as e:
        # Deliberately broad: this is the outermost boundary of the job, so
        # any failure is reported rather than allowed to escape.
        Msg = str(e)
        logging.exception("updatePostsDB failed: %s", Msg)
# Cron job definition (presumably Google App Engine's cron.yaml): one job,
# described as "retrieve newest posts", that requests /task_updatePosts/.
# NOTE(review): no `schedule:` entry is visible for this job and the nesting
# indentation appears to have been lost - confirm against the deployed file.
cron:
- description: retrieve newest posts
url: /task_updatePosts/
from appengine_django.models import BaseModel
from google.appengine.ext import db
class PostsDB(BaseModel):
    """Model with the link, title, author, date and description of a post.

    Presumably one entity is stored per blog post - confirm against the
    code that writes these entities.
    """
    # Every attribute below is a google.appengine.ext.db property.
    link = db.LinkProperty()
    title = db.StringProperty()
    author = db.StringProperty()
    date = db.DateTimeProperty()
    # TextProperty rather than StringProperty for the description field.
    description = db.TextProperty()
import urllib
#from BeautifulSoup import BeautifulSoup
from pyquery import PyQuery as pq
def _fetchDocument(url):
    """Download *url* and return its HTML parsed into a PyQuery document."""
    from contextlib import closing

    # closing() releases the HTTP response even when read() raises.
    with closing(urllib.urlopen(url)) as response:
        html = response.read()
    return pq(html)


def getArticleList(url):
    """Collect every article listed on a paginated article-list page.

    The page count is read from the ``ul.SG_pages`` element of *url*; each
    list page is then visited and every entry's link, title, date and full
    text are gathered.

    Args:
        url: Address of one list page, expected to end in ``<page>.html``
            (for example ``..._0_1.html``).

    Returns:
        A list of dicts with the keys ``'link'``, ``'title'``, ``'date'``
        and ``'desc'``; ``'desc'`` is whatever ``getPageContent(link)``
        returns.
    """
    import re

    # Strip the trailing "<page>.html" so any page number can be appended.
    # Falls back to the fixed six-character cut ("1.html") when the URL
    # does not end in that form.
    page_suffix = re.search(r'\d+\.html$', url)
    if page_suffix:
        url_prefix = url[:page_suffix.start()]
    else:
        url_prefix = url[:-6]

    first_doc = _fetchDocument(url)
    try:
        pager = first_doc("ul.SG_pages").find('span')
        # The pager text wraps the number in one character on each side.
        pageCnt = int(first_doc(pager).text()[1:-1])
    except Exception:
        # Any failure to read the pager means "treat as a single page".
        # Kept broad on purpose, but no longer a bare except, so
        # KeyboardInterrupt/SystemExit are not swallowed.
        pageCnt = 1

    lstArticles = []
    for i in range(1, pageCnt + 1):
        page_url = url_prefix + str(i) + ".html"
        # Reuse the document already downloaded above instead of fetching
        # the same page a second time.
        if page_url == url:
            d = first_doc
        else:
            d = _fetchDocument(page_url)
        title_spans = d(".atc_title").find('a')
        date_spans = d('.atc_tm')
        # zip() pairs titles with dates and stops at the shorter sequence,
        # so a mismatch in counts cannot raise IndexError.
        for titleObj, dateObj in zip(title_spans, date_spans):
            article = {}
            article['link'] = d(titleObj).attr('href')
            article['title'] = d(titleObj).text()
            article['date'] = d(dateObj).text()
            article['desc'] = getPageContent(article['link'])
            lstArticles.append(article)
    return lstArticles
def getPageContent(url):
    """Download one page and return the text of its article body.

    Args:
        url: Address of the page to download.

    Returns:
        The result of PyQuery's ``text()`` for the elements matching
        ``div.articalContent``.
    """
    from contextlib import closing

    # closing() releases the HTTP response even when read() raises.
    with closing(urllib.urlopen(url)) as response:
        html = response.read()
    d = pq(html)
    # The selector spelling "articalContent" is kept exactly as queried;
    # presumably it mirrors the class name used by the page markup.
    pageContent = d("div.articalContent").text()
    return pageContent
def main():
    """Save every article of one blog as a UTF-8 text file under ``blogs/``.

    Each file is named ``<date>_<title>.txt`` and contains the article's
    ``'desc'`` text as returned by ``getArticleList``.
    """
    import os

    # Known article-list URLs; only the selected one is downloaded.
    blog_urls = {
        'Han Han': 'http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html',
        'Gu Du Chuan Ling': "http://blog.sina.com.cn/s/articlelist_1225833283_0_1.html",
        'Feng': "http://blog.sina.com.cn/s/articlelist_1650910587_0_1.html",
        'Yuki': "http://blog.sina.com.cn/s/articlelist_1583902832_0_1.html",
    }
    url = blog_urls['Yuki']

    out_dir = "blogs"
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    lstArticles = getArticleList(url)
    for article in lstArticles:
        file_name = article['date'] + "_" + article['title'] + ".txt"
        # A path separator inside the date or title would point the file
        # into a directory that does not exist, so neutralise it.
        for separator in (os.sep, os.altsep):
            if separator:
                file_name = file_name.replace(separator, "_")
        # The with-block closes the file even if the write fails.
        with open(os.path.join(out_dir, file_name), 'w') as f:
            # Pay special attention to Chinese text: encode it explicitly
            # as UTF-8 before writing.
            f.write(article['desc'].encode('utf-8'))
if __name__=='__main__':