python入门(2)-目录文件列举和Beautiful Soup简单解析

功能:

  1.列举一个目录下的文件

  2.利用BeautifulSoup简单解析正文内容,然后保存

待完善:

  1.多线程支持

  2.适配器支持(for雷锋网和36氪两个网站网页)

"""

parser

    for parsing html file from leiphone.com and 36kr.com

    contact xiaoyang

"""



#

# @author:  xiaoyang

# @contact: [email protected]

# @version:

# @describe: parse an HTML file from leiphone.com

# @log:

#           1.2012-11-22 create

#           2.2012-11-23 add FileCollect and ParseTask class

#



import sys

import urllib2

import codecs

import os

from bs4 import BeautifulSoup



# Global definitions.
# NOTE(review): OUT_FILE_PREFIX and OUT_CNT are not referenced anywhere in the
# visible part of this file -- presumably reserved for numbered output files;
# confirm before relying on them.
OUT_FILE_PREFIX = "out"

OUT_CNT = 0

# Debug switches read by the demo driver at the bottom of the file:
# FileCollectDBG selects the FileCollect demo; when it is False, ParseTaskDbg
# selects the ParseTask demo.
FileCollectDBG=False

ParseTaskDbg=True



def errPrint(code, msg=''):
    """Print the module usage text and an optional message to stderr, then exit.

    Args:
        code: Exit status handed to ``sys.exit``.
        msg: Optional detail printed after the usage text. May be a string or
            any object with a useful ``str()`` (callers pass exceptions).

    Raises:
        SystemExit: Always, carrying ``code``.
    """
    # The module docstring doubles as the usage text. Skip it when the module
    # has no docstring rather than failing on ``None % dict``.
    if __doc__ is not None:
        sys.stderr.write("%s\n" % (__doc__ % globals(),))
    if msg:
        sys.stderr.write("%s\n" % (msg,))
    # Exit even when no message was supplied so that ``code`` is always
    # honoured and callers never continue past a reported error.
    sys.exit(code)



# for LeiPhone.com

def SaveResLP(doc,filename):
    """Write extracted leiphone.com content to ``filename``.

    The ``!LOCK!`` / ``!UNLOCK!`` markers are printed around the write; the
    unlock marker is printed even when the write fails.

    Args:
        doc: Text to write.
        filename: Destination path; an existing file is overwritten.

    Returns:
        True once the content has been written.

    Raises:
        SystemExit: Via ``errPrint`` when the file cannot be opened or written.
    """
    print("!LOCK!")
    try:
        # ``with`` closes the handle on every path and, unlike an explicit
        # ``fp.close()`` in ``finally``, does not fail when ``open`` itself
        # raised and no handle was ever created.
        with open(filename, "w") as fp:
            fp.write(doc)
    except IOError as errStr:
        errPrint(1, errStr)
    finally:
        print("!UNLOCK!")
    return True



# for 36kr.com

def SaveRes36K(doc,filename):
    """Placeholder saver for 36kr.com pages.

    Nothing is written yet: the function only prints the ``!LOCK!`` and
    ``!UNLOCK!`` markers. ``doc`` and ``filename`` are accepted but unused.

    Returns:
        True, always.
    """
    # Parenthesised single-argument prints behave the same under the
    # Python 2 print statement and the Python 3 print function.
    print("!LOCK!")
    print("!UNLOCK!")
    return True



class FileCollect:
    """Collect the sub-directory names and file paths found below a root.

    Both lists start empty and are filled by ``init()``:
        dlist: names (not full paths) of every sub-directory reported by
            ``os.walk``.
        flist: path of every file, formed from the directory that contains it
            and the file name.
    """

    def __init__(self, root):
        """Remember ``root``; no file-system access happens until ``init()``."""
        self.root = root
        self.dlist = []
        self.flist = []

    def init(self):
        """Walk ``self.root`` and append what is found to ``dlist``/``flist``.

        Results are appended, so calling this twice records every entry twice.

        Returns:
            True, always.
        """
        for cur_dir, dirs, files in os.walk(self.root):
            self.dlist.extend(dirs)
            # os.path.join inserts the separator that plain concatenation
            # dropped, which used to yield paths such as "/opt/projectfile".
            self.flist.extend(os.path.join(cur_dir, name) for name in files)
        return True



class ParseTask:
    """Extract the ``id="content_main"`` element of an HTML file and save it.

    Attributes:
        soup: BeautifulSoup tree of the most recently parsed file, or None.
        savedCnt: number of parsed files required before a save is triggered;
            with the initial value 0 every parse saves.
        doneCnt: files parsed since the last save.
        savedFileName: path the extracted content is written to.
    """

    def __init__(self, savedFileName):
        self.soup = None
        self.savedCnt = 0
        self.doneCnt = 0
        self.savedFileName = savedFileName

    def parse(self, readFileName):
        """Parse ``readFileName`` and save its first ``content_main`` element.

        Args:
            readFileName: path of the HTML file to read.

        Raises:
            SystemExit: Via ``errPrint`` when the file cannot be read or when
                it contains no element with ``id="content_main"``.
        """
        try:
            # open() raises on failure instead of returning None, so there is
            # no separate "handle is None" case; ``with`` closes the handle.
            with open(readFileName, "r") as fp:
                html = fp.read()
        except IOError as errStr:
            errPrint(1, errStr)
            return

        self.soup = BeautifulSoup(html)
        content = self.soup.find_all(id="content_main")
        if not content:
            # Report the missing element instead of failing with IndexError
            # on ``content[0]`` below.
            errPrint(1, 'no element with id="content_main" in %s' % readFileName)
            return

        self.doneCnt += 1
        if self.doneCnt >= self.savedCnt:
            SaveResLP(str(content[0]), self.savedFileName)
            self.doneCnt = 0



# Demo driver: runs whenever the module is executed or imported.
# FileCollectDBG takes precedence; ParseTaskDbg is only consulted when it is False.
if FileCollectDBG:		
    # Walk a hard-coded directory and dump the collected directory names and
    # file paths.
    fc = FileCollect("/opt/project/")

    fc.init()

    # Python 2 print statement: no space is emitted after the "\r\n", so the
    # list starts at the beginning of the next line.
    print "dlist:\r\n", fc.dlist

    print "flist:\r\n", fc.flist

elif ParseTaskDbg:
    # Parse one hard-coded HTML file (path relative to the working directory)
    # and save its content_main element to out.html.
    newTask=ParseTask("out.html")

    newTask.parse("1119-vv-dolby.html")

    print "saved OK!\r\n"

 

你可能感兴趣的:(python)