Python 抓取链接脚本程序

从百度贴吧抓取前若干页所有帖子的脚本

import io
import re
import string
import urllib

try:
    from urllib.request import urlopen  # Python 3 location
except ImportError:
    from urllib import urlopen  # Python 2 fallback

# The listing is paged through the trailing "pn=" offset in steps of 50.
_POSTS_PER_PAGE = 50

# Matches from the "f?kz=<id>" part of a post link up to the first closing
# anchor tag.  Group 1 is the numeric post id, group 2 is everything between
# the id and "</a>" (remaining attributes plus the link text).
_POST_LINK_RE = re.compile(r'f\?kz=(\d+)(.+?)</a>')

# The link text starts right after this attribute/tag boundary.
_TARGET_MARKER = '"_blank">'


def _download(url):
    """Return the raw response body for *url*, closing the connection."""
    response = urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _extract_links(html):
    """Return ``(post_id, title)`` pairs for the post links found in *html*.

    Matches that contain ``onclick`` are skipped, as are matches without the
    ``"_blank">`` marker (there is no title to extract from those).
    """
    links = []
    for match in _POST_LINK_RE.finditer(html):
        if "onclick" in match.group(0):
            continue
        _, marker, title = match.group(2).partition(_TARGET_MARKER)
        if not marker:
            continue
        links.append((match.group(1), title))
    return links


def baidutieba(url, PostBegin, PostEnd, output_path="百度贴吧.html",
               fetch=None, encoding="utf-8"):
    """Collect post links from consecutive listing pages into an HTML file.

    For every offset in ``range(PostBegin, PostEnd, 50)`` the page at
    ``url + str(offset)`` is fetched, its post links are extracted, and one
    ``<a href=...>title</a><br/>`` line per link is printed and written to
    *output_path*.

    NOTE(review): the published snippet had lost its HTML fragments (the
    closing tag in the regex, the ``split`` separator and the anchor markup in
    the output string).  They are reconstructed here from the surrounding
    logic; confirm the pattern and the link prefix against a real page.

    Args:
        url: Listing URL ending in the page-offset parameter (e.g. ``...&pn=``).
        PostBegin: First page offset (inclusive).
        PostEnd: Last page offset (exclusive).
        output_path: File that receives the generated links; it is overwritten.
        fetch: Optional callable ``fetch(url) -> bytes or str`` used instead of
            the built-in downloader (custom headers, caching, offline use).
        encoding: Charset used to decode pages that arrive as bytes.
    """
    if fetch is None:
        fetch = _download

    # The context manager closes the file even if a download fails midway.
    with io.open(output_path, "w", encoding="utf-8") as out:
        for offset in range(PostBegin, PostEnd, _POSTS_PER_PAGE):
            page = fetch(url + str(offset))
            if isinstance(page, bytes):
                # Undecodable bytes are replaced rather than aborting the run.
                page = page.decode(encoding, "replace")
            for num, title in _extract_links(page):
                link = ('<a href="http://tieba.baidu.com/f?kz=' + num + '">'
                        + title + '</a><br/>')
                print(link)
                out.write(link + "\n")


tiebaurl = 'http://tieba.baidu.com/f?kw=2012&pn='
iPostBegin = 0
iPostEnd = 500

if __name__ == "__main__":
    baidutieba(tiebaurl, iPostBegin, iPostEnd)

你可能感兴趣的:(Python,脚本,python,百度,import,url,string)