python html转TXT python读取html指定区域文本内容转成txt文件

1、首先,通过 Python 遍历程序目录下【html】文件夹里面的文件、子目录以及子目录中的

文件,获取到该目录下所有的【.html】文件后,返回一个 list 对象

2、遍历完成后得到一个html文件列表对象,将该列表交给html_to_txt方法,html_to_txt方法

里面循环逐个读取 html 文件中指定标签(例如正文容器里的 h1 标题标签

和 p 段落标签)中的文字,把这些标签里面的文本内容提取出来

3、读取到的文本内容输出到txt文件中,这里可以加上一个替换replace,把我们不需要的内

容替换之后,这里可以做多次替换,也可以加上换行之类的处理,再进行输出,可根据自己

的需求修改,如果有什么不明之处,可以提问

资源

main.py


```python
import glob
import os
import re
import pypandoc
from selectolax.parser import HTMLParser
from html.parser import HTMLParser

from lxml import etree

from Html_To_txt import html_to_txt


# 解析本地html,返回字典数据类型
def parse_html(file_path, vla=None):
    for ff in file_path:
        val = []
        with open(ff, 'r', encoding='gbk') as f:
            html = etree.HTML(f.read())
        Title = html.xpath("//*[@id='left']/div/div[2]/h1/text()")

        contents = html.xpath("//td[@class='info_content']/*")
        val.append(Title)

        for td in contents:
            val.append(td.text)
            a = html_for(td.xpath("./strong/a"))

            b = html_for(td.xpath("./strong/tail()"))
            vla.append(a.text)
            vla.append(b.text)
        txt = open(os.getcwd() + "\\txt\\" + ff.split('\\')[-1], 'w', encoding="utf-8")
        txt.write(val)
        txt.close()

        # res = {}
        # for div in divs:
        #     key = div.xpath("./span[1]/text()")[0].replace('/', '_')
        #     value = div.xpath("./span[2]/text()")[0]
        #     res[key] = value


def html_for(html_obj):
    """Recursively flatten the text of a list of element nodes.

    Args:
        html_obj: list of element objects exposing ``len()`` (child
            count), ``.text`` and ``.tail`` (lxml/ElementTree style).

    Returns:
        A flat list of strings: for each node, its ``text`` + ``tail``
        (``None`` treated as ``""``); nodes with more than two children
        are descended into recursively.
    """
    collected = []
    for node in html_obj:
        if len(node) > 2:
            # Deep element: recurse and splice the child results in flat
            # (the original appended a list and then read .text off it,
            # which raised AttributeError).
            collected.extend(html_for(list(node)))
        else:
            # text/tail may be None on lxml/ET nodes — coerce to "" so
            # the concatenation can never raise TypeError.
            collected.append((node.text or '') + (node.tail or ''))
    return collected


def search_dir(pathstr, file_all=None):
    """Recursively collect the paths of all ``.html`` files under a directory.

    Args:
        pathstr: root directory to scan.
        file_all: accumulator list used by the recursion; callers normally
            omit it. Created fresh per top-level call (the original used a
            mutable default, so results leaked across calls).

    Returns:
        List of full paths of every ``.html`` file found, printed as a
        side effect.
    """
    if file_all is None:
        file_all = []
    for name in os.listdir(pathstr):
        full = os.path.join(pathstr, name)
        if os.path.isdir(full):
            # Pass the accumulator down explicitly — the original relied
            # on the shared default list, which broke when a caller
            # supplied its own list.
            search_dir(full, file_all)
        elif os.path.splitext(name)[1] == '.html':
            print(full)
            file_all.append(full)
    return file_all


# 按间距中的绿色按钮以运行脚本。
# Script entry point: collect every .html under <cwd>\html and convert
# each one to a .txt file via Html_To_txt.html_to_txt.
if __name__ == '__main__':
    html_root = os.getcwd() + r'\html'
    html_files = search_dir(html_root)
    html_to_txt(html_files)


Html_To_txt.py


import os

from lxml import etree

from sgmllib import SGMLParser


# NOTE(review): sgmllib was removed in Python 3, so the original listing
# could not run at all. GetIdList is ported to the stdlib
# html.parser.HTMLParser, keeping the same public surface
# (feed / reset / IDlist / printID).
from html.parser import HTMLParser


class GetIdList(HTMLParser):
    """Collect the text of <h1> and <p> tags that sit inside the
    <div class="article-content"> container of a page.

    After ``feed(html)``, the extracted fragments are in ``self.IDlist``
    in document order.
    """

    def reset(self):
        # Collected text fragments, in document order.
        self.IDlist = []
        # True while the parser is inside the target div.
        self.flag = False
        # True while inside an <h1>/<p> whose text should be kept.
        self.getdata = False
        # Depth of nested <div>s below the target div, so the matching
        # close tag can be recognised.
        self.verbatim = 0
        HTMLParser.reset(self)

    def handle_starttag(self, tag, attrs):
        # Dispatch replacing the SGMLParser start_div/start_h1/start_p hooks.
        if tag == 'div':
            self._start_div(attrs)
        elif tag in ('h1', 'p') and self.flag:
            self.getdata = True

    def handle_endtag(self, tag):
        # Dispatch replacing the SGMLParser end_div/end_h1/end_p hooks.
        if tag == 'div':
            self._end_div()
        elif tag in ('h1', 'p'):
            self.getdata = False

    def _start_div(self, attrs):
        if self.flag:
            self.verbatim += 1  # entered a nested div inside the target
            return
        for k, v in attrs:
            if k == 'class' and v == 'article-content':
                self.flag = True  # entered the target container
                return

    def _end_div(self):
        if self.verbatim == 0:
            self.flag = False  # left the target container
        else:
            self.verbatim -= 1  # left a nested div only

    def handle_data(self, text):
        # Keep text only while inside a wanted tag within the target div.
        if self.getdata:
            self.IDlist.append(text)

    def printID(self, new_file):
        """Write the collected fragments to ``<cwd>\\txt\\<basename>.txt``.

        Fragments containing a full stop ('。'), and the very first
        fragment, get a trailing newline; the rest are concatenated.
        """
        out_path = os.getcwd() + "\\txt\\" + new_file.split('\\')[-1] + '.txt'
        with open(out_path, 'w', encoding='gbk') as f:
            wrote_line = False
            for fragment in self.IDlist:
                print(fragment)
                if '。' in fragment or not wrote_line:
                    f.write(fragment + '\n')
                    wrote_line = True
                else:
                    f.write(fragment)


def html_to_txt(html_list):
    """Parse each HTML file in ``html_list`` and dump the extracted
    article text into ``<cwd>\\txt\\`` (one .txt per input file)."""
    for page_path in html_list:
        # with-block closes the handle the original leaked.
        with open(page_path, 'r', encoding='utf-8') as fh:
            html = fh.read()
        lister = GetIdList()
        lister.feed(html)
        lister.printID(page_path)

你可能感兴趣的:(python小技巧小妙招,python,html转txt,python,爬虫,html)