python读取只读word只读_python读取word文档识别字段颜色,解析字段

Python 版本为 3.7.3,读取的文档格式为 .docx。

下面的代码中带有简单注释。

python读取只读word只读_python读取word文档识别字段颜色,解析字段_第1张图片

如果看不懂,可以从百度网盘下载源码直接查看,按需修改后运行其中的 py 文件。

  • 网盘下载
  • 提取码:nngw
python读取只读word只读_python读取word文档识别字段颜色,解析字段_第2张图片
# Convert multiple-choice quiz questions stored in .docx files into JSON.
#
# Each question occupies five consecutive non-empty paragraphs: the question
# text followed by four options. The correct option is the one containing a
# run whose font colour is pure green (00FF00).

import codecs
import collections
import io
import json
import os
import string
import sys

import docx
import xlrd
from docx import Document
from docx.shared import RGBColor  # python-docx colour class

maxLength = 0       # longest question line seen so far (diagnostic only)
id = 1              # next question id; shadows the builtin, name kept for compatibility
convert_list = []   # accumulated question dicts for the current output file
type_list = []      # reset by writeDocx; nothing in this script fills it
curPath = os.path.dirname(os.path.abspath(__file__))

_QUESTION_PREFIX = "Número"
# Candidate separators between the question number and its text, in priority order.
_SEPARATORS = ("-", ".", " ")
# Trailing "this is the right answer" markers that must not end up in the option text.
_ANSWER_MARKERS = ("(Direito)", "(Correcta)", "(Right)", "(Correct)")
_CORRECT_RGB = "00FF00"
_PARAGRAPHS_PER_QUESTION = 5


def _strip_number_prefix(text):
    """Drop a leading "Número" label (plus the one character after it)."""
    pos = text.find(_QUESTION_PREFIX)
    if pos != -1 and pos < 2:
        return text[pos + len(_QUESTION_PREFIX) + 1:]
    return text


def _question_after_separator(text):
    """Return the question text that follows the first usable separator.

    Separators are tried in the order "-", ".", " ". Anything after a "|"
    (the sub-type suffix) is discarded. If no separator is present the whole
    line is used instead of raising ValueError.
    """
    body = text
    for separator in _SEPARATORS:
        pos = text.find(separator)
        if pos != -1:
            body = text[pos + len(separator):]
            break
    return body.split("|")[0]


def _strip_answer_marker(text):
    """Cut the option text at each known answer marker, if present."""
    for marker in _ANSWER_MARKERS:
        pos = text.find(marker)
        if pos != -1:
            text = text[:pos]
    return text


def readDocx(fileName: str, type: str) -> None:
    """Parse one .docx file and append its questions to ``convert_list``.

    Args:
        fileName: Path of the document relative to the script directory,
            without the ``.docx`` extension.
        type: Category stored as ``subType`` on every question from this file.

    The first paragraph and all blank paragraphs are skipped. Every group of
    five remaining paragraphs becomes one dict with the keys ``question``,
    ``subType``, ``option1``..``option4``, ``_id`` and, when a green run is
    found, ``rightIndex``. A trailing incomplete group is dropped.
    """
    global id
    global maxLength

    # os.path.join supplies the separator that plain concatenation omitted.
    xlsFile = os.path.join(curPath, fileName + '.docx')
    print("xlsFile: "+xlsFile)
    document = docx.Document(xlsFile)

    index = 0   # position inside the current question (0 = question text)
    data = {}
    for line_no, paragraph in enumerate(document.paragraphs, start=1):
        if line_no <= 1:  # skip the first line of the document
            continue
        # Work on a copy: assigning to paragraph.text would rewrite the runs.
        text = paragraph.text
        if not text.strip():
            continue

        if index == 0:
            # Question line.
            text = _strip_number_prefix(text)
            if len(text) > maxLength:
                maxLength = len(text)
            data["question"] = _question_after_separator(text)
            data["subType"] = type
        else:
            # Option line: a green run marks the correct answer.
            for run in paragraph.runs:
                if str(run.font.color.rgb) == _CORRECT_RGB:
                    data["rightIndex"] = index
            data["option" + str(index)] = _strip_answer_marker(text)

        index = index + 1
        if index >= _PARAGRAPHS_PER_QUESTION:
            data["_id"] = id
            convert_list.append(data)
            index = 0
            id = id + 1
            data = {}


def writeDocx(fileList, name: str) -> None:
    """Convert every document in ``fileList`` and write them as one JSON file.

    Args:
        fileList: Iterable of dicts with the keys ``"path"`` and ``"type"``,
            passed straight to ``readDocx``.
        name: Base name of the output; the result is written to
            ``<script dir>/topic/<name>.txt``.

    Resets the module-level id counter and result lists before reading, so
    each output file starts numbering at 1.
    """
    global id
    global convert_list
    global type_list
    id = 1
    convert_list = []
    type_list = []

    for entry in fileList:
        readDocx(entry["path"], entry["type"])

    # Question bank output path.
    jsonPath = os.path.join(curPath, "topic", name + ".txt")
    os.makedirs(os.path.dirname(jsonPath), exist_ok=True)
    with io.open(jsonPath, 'w', encoding='utf-8') as f:
        f.write(json.dumps(convert_list, ensure_ascii=False, indent=4, sort_keys=True))


def main() -> None:
    """Convert the Portuguese, Spanish and English question banks."""
    # NOTE(review): the "path" values look as if a directory separator was lost
    # between the locale folder and the file name (e.g. "en_us_topic" + "地理…");
    # confirm against the actual folder layout before running.
    en_fileList = [
        {"path": "en_us_topic地理(英)Respueda G .es.en", "type": "World"},
        {"path": "en_us_topic科学与技术(英)", "type": "Technology"},
        {"path": "en_us_topic历史(英)Resupeda H.es.en", "type": "History"},
        {"path": "en_us_topic艺术和文学(英)Respueda A&L.es.en", "type": "ArtAndLiterature"},
        {"path": "en_us_topic娱乐(英)Respueda E.es.en", "type": "Fashion"},
        {"path": "en_us_topic运动(英)Respueda  D.es.en", "type": "Sports"},
    ]
    en_name = "en_us_topic"

    es_fileList = [
        {"path": "es_es_topic地理(西)Respueda G ", "type": "World"},
        {"path": "es_es_topic科学与技术(西)Respueda C&T", "type": "Technology"},
        {"path": "es_es_topic历史(西)Resupeda H", "type": "History"},
        {"path": "es_es_topic艺术和文学(西)Respueda A&L", "type": "ArtAndLiterature"},
        {"path": "es_es_topic娱乐(西)Respueda E", "type": "Fashion"},
        {"path": "es_es_topic运动(西)Respueda  D", "type": "Sports"},
    ]
    es_name = "es_es_topic"

    pt_fileList = [
        {"path": "pt_br_topic地理(葡)Respueda G .es.pt", "type": "World"},
        {"path": "pt_br_topic科学与技术(葡)", "type": "Technology"},
        {"path": "pt_br_topic历史(葡)Resupeda H.es.pt", "type": "History"},
        {"path": "pt_br_topic艺术和文学(葡)Respueda A&L.es.pt", "type": "ArtAndLiterature"},
        {"path": "pt_br_topic娱乐(葡)Respueda E.es.pt", "type": "Fashion"},
        {"path": "pt_br_topic运动(葡)Respueda  D.es.pt", "type": "Sports"},
    ]
    pt_name = "pt_br_topic"

    writeDocx(pt_fileList, pt_name)
    writeDocx(es_fileList, es_name)
    writeDocx(en_fileList, en_name)


if __name__ == "__main__":
    main()

有什么问题欢迎大家评论区留言讨论,都看到这了,别忘了点关注哦!

你可能感兴趣的:(python读取只读word只读_python读取word文档识别字段颜色,解析字段)