起点小说下载工具 代码汇总

起点现在的双域名分别是不同的页面架构技术:
cmfu是原来老的ASP,对于公开章节,起点采用的是js调用一个txt,相对原来直接页面显示,确实是一个明智的选择(对抓取的来说也方便:p).
qidian采用的是新的.Net技术,这个里面就需要采用正则去匹配了.

结合上一篇文章中我的小程序,这里把社区内相关的代码全部展示一下。
首先是我的:

#!/usr/bin/python
#filename:simpleCMFU
import re
import urllib

def alaynsis_id(url_given):
    """Return the file name part of a chapter text URL.

    For a URL such as
    ``http://files.qidian.com/author3/172602/4451850.txt`` this yields
    ``4451850.txt``.  The name is taken as everything after the last ``/``
    rather than a fixed number of trailing characters, so chapter ids of
    any length work and an empty string no longer raises ``IndexError``.

    url_given -- URL (or bare file name) as a string.
    """
    return url_given.rsplit('/', 1)[-1]


def read(url_given):
    """Fetch a chapter page and return the URL of its text file.

    The page is searched for links shaped like
    ``http://files.qidian.com/author3/172602/4451850.txt`` and the first
    one found is returned.

    url_given -- address of the chapter page to download.

    Raises IndexError when the page contains no such link.
    """
    html = urllib.urlopen(url_given)
    try:
        page = html.read()
    finally:
        # Release the connection even when reading fails.
        html.close()
    # Dots are escaped so they only match a literal '.', not any character.
    rex = r'http://files\.qidian\.com/author[0-9]/\d{6}/\d{7}\.txt'
    url_down = test(page, rex)
    if not url_down:
        raise IndexError('no chapter text URL found in %s' % url_given)
    return url_down[0]

def read2(url, name):
    """Download ``url`` and save its content, minus a prefix, to ``name``.

    url  -- address of the chapter text file.
    name -- path of the local file to (over)write.

    Returns the string 'ok'.
    """
    html = urllib.urlopen(url)
    try:
        page = html.read()
    finally:
        html.close()
    # The first 15 characters are dropped; presumably this is the JavaScript
    # wrapper in front of the chapter text -- TODO confirm.
    page = page[15:]
    # open() replaces the Python 2-only file() builtin; try/finally makes
    # sure the handle is closed even if the write fails.
    fl = open(name, 'w')
    try:
        fl.write(page)
    finally:
        fl.close()
    return 'ok'

def test(html,rex):
#r = re.compile(rex)
matchs = re.findall(rex,html,re.DOTALL)
return matchs

def run():
    """Prompt for a chapter page address and save its text locally.

    The chapter's text URL is looked up with read(); the local file name
    is derived from that URL by alaynsis_id() and the download is done by
    read2().
    """
    page_url = raw_input('please send address you wanted:')
    chapter_url = read(page_url)
    read2(chapter_url, alaynsis_id(chapter_url))

if __name__ == '__main__':
    # Print the banner, then run the interactive download once.
    banner = 'this program is just for download text from qidian.com by [email protected]\\please visit http://duducai.iteye.com \n'
    print(banner)
    run()



其次是社区Ben Luo的大作,走的是sina读书频道:

#####################
#html2txt.py
#####################

from formatter import AbstractFormatter, NullWriter
from htmllib import HTMLParser

def _(str, in_encoder="gbk", out_encoder="utf8"):
    """Re-encode a byte string from one encoding to another.

    str         -- encoded byte string (the name is kept for callers that
                   pass it by keyword, although it shadows the builtin).
    in_encoder  -- encoding ``str`` is currently in.
    out_encoder -- encoding of the returned byte string.

    Decoding goes through the byte string's own ``decode`` method instead
    of the ``unicode`` builtin; for byte-string input the result is the
    same, and the helper no longer depends on a Python 2-only name.
    """
    return str.decode(in_encoder).encode(out_encoder)


class myWriter(NullWriter):
    """Formatter writer that stores flowing text instead of emitting it."""

    def __init__(self):
        NullWriter.__init__(self)
        # Text chunks, in the order send_flowing_data() received them.
        self._bodyText = []

    def send_flowing_data(self, str):
        """Record one chunk of flowing text."""
        self._bodyText += [str]

    def _get_bodyText(self):
        """Return all recorded chunks joined by newlines."""
        chunks = self._bodyText
        return '\n'.join(chunks)

    bodyText = property(fget=_get_bodyText, doc='plain text from body')

class myHTMLParser(HTMLParser):
    """HTMLParser that keeps the attributes passed to do_meta()."""

    def do_meta(self, attrs):
        """Store ``attrs`` on the parser as ``metas``.

        Every call replaces the previously stored value.
        """
        self.metas = attrs

def convertFile(filename):
    """Convert one HTML file to plain text.

    filename -- path of the HTML file to read.

    Returns a tuple ``(title, body_text)``: the page title re-encoded by
    ``_`` (an empty string when the page has no title) and the flowing text
    collected by the writer.
    """
    mywriter = myWriter()
    parser = myHTMLParser(AbstractFormatter(mywriter))
    source = open(filename)
    try:
        parser.feed(source.read())
    finally:
        # Do not leak the file handle when parsing fails.
        source.close()
    # Flush whatever the parser is still buffering so trailing text is
    # not lost.
    parser.close()
    title = parser.title
    if title is None:
        # Pages without a <title> would otherwise make the re-encoding fail.
        title = ''
    return (_(title), mywriter.bodyText)

import os
import os.path

OUTPUTDIR = "./txt"  # directory receiving the generated .txt files
INPUTDIR = "."       # directory scanned for .htm / .html files
if __name__ == "__main__":
    import sys

    if not os.path.exists(OUTPUTDIR):
        os.mkdir(OUTPUTDIR)

    for entry in os.listdir(INPUTDIR):
        if not entry.endswith(('.htm', '.html')):
            continue
        # Progress text stays on one line: "Coverting <name> Done!".
        sys.stdout.write("Coverting %s " % entry)
        # listdir() yields bare names, so build the real path; otherwise
        # the conversion only works when INPUTDIR is the current directory.
        _title, text = convertFile(os.path.join(INPUTDIR, entry))
        outfilename = os.path.splitext(entry)[0] + '.txt'
        outfullname = os.path.join(OUTPUTDIR, outfilename)
        out = open(outfullname, "wt")
        try:
            out.write(text)
        finally:
            # Close explicitly so the text is flushed to disk right away.
            out.close()
        print("Done!")




################################
#pickupcontent.py
################################

# -*- coding: utf-8 -*-

import sys
import glob
import os
import re

startstr = u"^八十".encode("gb2312")  # article title
endstr = u"^\\[返回".encode("gb2312")  # line that starts the footer link
tmp_start = re.compile(startstr)
tmp_end = re.compile(endstr)


def _find_content_range(lines, start_re, end_re):
    """Locate the slice of ``lines`` that holds the article body.

    lines    -- list of lines read from one file.
    start_re -- compiled pattern marking the first line to keep.
    end_re   -- compiled pattern marking the first line to drop.

    Returns ``(start, end)`` such that ``lines[start:end]`` is the body:
    ``start`` is the last line matching ``start_re`` seen before the first
    line matching ``end_re``.  Returns None when the end marker is missing
    or no start marker precedes it.
    """
    kstart = None
    for index, line in enumerate(lines):
        if start_re.match(line):
            kstart = index
        if end_re.match(line):
            if kstart is None:
                return None
            return kstart, index
    return None


# Guarded like the other scripts so that importing the module does not
# rewrite files.
if __name__ == "__main__":
    # Expand wildcards ourselves (the Windows shell does not do it).
    sys.argv[1:] = [item for arg in sys.argv[1:] for item in glob.glob(arg)]
    for infile in sys.argv[1:]:
        f = open(infile, 'r')
        try:
            lines = f.readlines()
        finally:
            f.close()

        found = _find_content_range(lines, tmp_start, tmp_end)
        if found is None:
            # Leave the file untouched instead of crashing, or reusing the
            # indexes computed for a previous file.
            sys.stderr.write("skipped %s: start/end marker not found\n" % infile)
            continue
        kstart, kend = found

        # The temporary file sits next to the target so the final rename
        # never crosses directories and cannot clobber an unrelated 'tmp'.
        tmpname = infile + '.tmp'
        tmp = open(tmpname, 'w')
        try:
            tmp.writelines(lines[kstart:kend])
        finally:
            tmp.close()
        os.remove(infile)
        os.rename(tmpname, infile)



最后是BIGZHU的:

#@+leo-ver=4-thin-encoding=gb2312,.
#@+node:BIGZHU.20070731160918:@thin d:/bigzhu/python/python_project/get_cmfu.py
#@+at
#@nonl
# 起点小说爬虫
#@-at
#@@c
#@@language python
#@+others
#@+node:BIGZHU.20070731161308:import
import httplib,urllib2,urllib,cookielib,re,threading
import os
#@nonl
#@-node:BIGZHU.20070731161308:import
#@+node:BIGZHU.20070731160928:getCookie
def getCookie():
    """Return a urllib2 opener whose requests share one cookie jar."""
    jar = cookielib.CookieJar()  # holds the cookies for this opener
    cookie_handler = urllib2.HTTPCookieProcessor(jar)
    return urllib2.build_opener(cookie_handler)
#@-node:BIGZHU.20070731160928:getCookie
#@-others
#@<>
#@+node:BIGZHU.20070731160918.1:<>
def getBookIdList(urlList):
    """Collect the distinct book ids linked from the given pages.

    urlList -- iterable of page URLs; blank entries are skipped and
               surrounding whitespace is ignored.

    Every page is downloaded and scanned for ``bl_id=<digits>``.  Ids are
    returned as strings, in order of first appearance, without duplicates.
    """
    BookIdList = []
    seen = set()  # O(1) duplicate check alongside the ordered list
    for url in urlList:
        url = url.strip()
        if not url:
            # A blank entry (e.g. read from an empty URL.ini) is not a
            # page to fetch.
            continue
        response = urllib2.urlopen(url)
        try:
            cmfu = response.read()
        finally:
            response.close()
        # The site's markup is inconsistent, so match the id parameter on
        # its own rather than a complete "showbook.asp?bl_id=..." link.
        bookCount = len(BookIdList)
        for book_id in re.findall(r"bl_id=(\d+)", cmfu):
            if book_id not in seen:
                seen.add(book_id)
                BookIdList.append(book_id)
        # NOTE(review): the original indentation was lost; both counters
        # are assumed to be reported once per page.
        print("取得书本数目:%i" % (len(BookIdList) - bookCount))
        print("合计取得下载书本:%i" % len(BookIdList))
    return BookIdList

#@-node:BIGZHU.20070731160918.1:<>
#@nl
#@<>
#@+node:BIGZHU.20070731164705:<>
def getBookName(opener, bookId=''):
    """Look up the title of a book on its cmfu.com page.

    opener -- object whose ``open(url)`` returns a readable response.
    bookId -- id of the book (the ``bl_id`` value), as a string.

    Returns the text following ``bookname=`` in the page, with double
    quotes removed.

    Raises ValueError for an empty ``bookId`` and IndexError when the page
    carries no ``bookname=`` entry.
    """
    if bookId == '':
        print("传入BookIdList是空的")
        # Stop here: requesting a page for an empty id cannot succeed.
        raise ValueError('bookId must not be empty')
    bookURL = 'http://www.cmfu.com/readbook.asp?bl_id=%s' % bookId
    response = opener.open(bookURL)
    try:
        bookPage = response.read()
    finally:
        response.close()
    # The opener is left open on purpose: the caller keeps using it.
    found = re.search(r'bookname=\S+', bookPage)
    if found is None:
        raise IndexError('no bookname found for book id %s' % bookId)
    return found.group(0).replace('bookname=', '').replace('"', '')

#@-node:BIGZHU.20070731164705:<>
#@nl
#@<>
#@+node: BIGZHU.20070731171721:<>
def getTextFile(opener, bookId):
    """Download one book as ``<cwd>/起点/<book name>.txt``.

    opener -- opener used for both the title lookup and the download.
    bookId -- id of the book to fetch.

    Nothing is downloaded when the target file already exists.  The
    download is attempted up to three times; when every attempt fails no
    file is created, so a later run can try again.
    """
    bookName = getBookName(opener, bookId)
    target = os.path.join(os.getcwd(), "起点", "%s.txt" % bookName)
    # Skip books that were already downloaded.
    if os.path.isfile(target):
        print("%s 已经存在" % bookName)
        return

    url = 'http://download.cmfu.com/pda/%s.txt' % bookId
    bookData = None
    # One message per failed attempt, in the order they are printed.
    for failure_note in ("2 %s", "last try %s", "end try %s"):
        try:
            response = opener.open(url)
            try:
                bookData = response.read()
            finally:
                response.close()
            break
        except Exception:
            # Unlike a bare except, this lets Ctrl-C and interpreter exit
            # through.
            print(failure_note % bookName)

    opener.close()

    if bookData is None:
        # All attempts failed.  Writing now would leave an empty file that
        # makes the book look downloaded on the next run.
        return

    f = open(target, "wb")
    try:
        f.write(bookData)
    finally:
        f.close()
    print('get book %s 完毕' % bookName)
#@-node:BIGZHU.20070731171721:<>
#@nl
#@<>
#@+node:BIGZHU.20070801172939:<>
class runGetFile(threading.Thread):
    """Thread that downloads a single book."""

    def __init__(self, bookId):
        super(runGetFile, self).__init__()
        # Id of the book this thread is responsible for.
        self.bookId = bookId

    def run(self):
        """Download the book through an opener created for this thread."""
        getTextFile(getCookie(), self.bookId)
#@nonl
#@-node: BIGZHU.20070801172939:<>
#@nl
#@<>
#@+node:BIGZHU.20070802171013:<>
class ProcessURL:
    """Keep the list of page URLs in ``<cwd>/起点/URL.ini``.

    The file holds the URLs separated by ';'.  getURLIni() loads them and
    saveURL() writes a complete list back.
    """
    def __init__(self):
        pass
    #@ <>
    #@+node:BIGZHU.20070802171013.1:<>
    def saveURL(self, urlList=None):
        '''Write ``urlList`` to URL.ini, replacing its previous content.

        urlList -- list of URL strings; blank entries are not written.
        '''
        if urlList is None:
            # None instead of a shared mutable default list.
            urlList = []
        try:
            f = open(os.path.join(os.getcwd(), "起点", "URL.ini"), "wb")
        except IOError:
            print("文件打开错误")
            # Without a file there is nothing to write.
            return
        try:
            # Stored as one ';'-separated string.
            f.write(";".join([url for url in urlList if url]))
        finally:
            f.close()
    #@-node:BIGZHU.20070802171013.1:<>
    #@nl
    #@ <>
    #@+node:BIGZHU.20070802171013.2:<>
    def getURLIni(self):
        """Return the URLs stored in URL.ini as a list.

        The ``起点`` directory and an empty URL.ini are created when they
        do not exist yet.  An empty file yields an empty list.
        """
        folder = os.path.join(os.getcwd(), "起点")
        # Make sure the data directory exists.
        if not os.path.exists(folder):
            print("创建目录 \\起点")
            os.mkdir(folder)

        ini_path = os.path.join(folder, "URL.ini")
        iniData = ''
        if os.path.isfile(ini_path):
            f = open(ini_path, "rb")
            try:
                iniData = f.read()
            finally:
                f.close()
        else:
            print("URL.ini不存在,创建之")
            open(ini_path, "wb").close()
        # split() on an empty string gives [''], so drop blank entries.
        return [url for url in iniData.split(";") if url]
#@-node:BIGZHU.20070802171013.2: <>
#@nl


#@-node:BIGZHU.20070802171013:<>
#@nl
#@<>
#@+node:BIGZHU.20070731164705.1:<>
if __name__ == '__main__':
    # Example pages:
    #   http://www.cmfu.com/index.asp
    #   http://www.cmfu.com/listbookqb.asp?pageid=2007-8-1%2012:26&status=down
    #   http://www.cmfu.com/listbookqb.asp?pageid=2007-7-31%2023:03&status=down
    #   http://www.cmfu.com/index_wxxx.asp
    # Load the stored URLs; new ones entered below are saved at the end.
    urlType = ProcessURL()
    # Blank entries (an empty ini splits to ['']) are not pages to fetch.
    urlList = [u for u in urlType.getURLIni() if u.strip()]
    saveIni = False  # set once a new URL has been entered
    while True:
        # Whitespace pasted around the address would never match a stored
        # URL and cannot be fetched.
        url = raw_input("要截取的起点的某个页面: ").strip()
        if url == '':
            break
        if url in urlList:
            print("%s 已有,忽视之" % url)
        else:
            urlList.append(url)
            print("%s 是新的,添加之" % url)
            saveIni = True

    bookIdList = getBookIdList(urlList)

    # One download thread per book.
    for i in bookIdList:
        thread = runGetFile(i)
        thread.start()
    # Persist the URL list only when it changed.
    if saveIni:
        urlType.saveURL(urlList)
#@-node:BIGZHU.20070731164705.1:<>
#@nl
#@nonl
#@-node:BIGZHU.20070731160918:@thin d:/bigzhu/python/python_project/get_cmfu.py
#@-leo

你可能感兴趣的:(美丽的Python,OS,Python,ASP,ASP.net,F#)