网络上介绍读取 docx 文本的文章很多,但如何把每一个自然段逐一存入相应的字典 key:value 中却非常繁琐,需要把逻辑理得很清楚。
下面再次给出我的需求和方案。
最终读取后形成如下格式:
[{
"content": "输入一个正整数,输出所有的质因子。如24=2*2*2*3。实现上述功能的Python代码如下:\n\n\n\nn=int(input(″输入一个正整数:″))\n\n\n\ni=2\n\n\n\nwhile______①______:\n\n\n\n if n % i==0:\n\n\n\nn=n/i\n\n\n\nprint(i)\n\n\n\n else:\n\n\n\n______②______\n\n\n\n(1)在程序划线处填入合适的代码。\n\n\n\n(2)按照上述算法,输入60,依次输出的质因子是____________。",
"answer": "(1)①n>1或n! =1 ②i+=1 (2)2 2 3 5",
"explain": "最小的质因数是2,如果能被2整除,则反复相除,当不能被2整除时,将i增加1,尝试被3整除,如果还不能除通,往上增加到4,由于前面反复除2操作,因此不可能被不是质数的数除通。当相除的结果为1时,终止循环。输入60,可以被2除2次,被3除1次,被5除1次。",
"reference": 53,
"type": "填空题",
"difficulty_level": "中级",
"knowledgepoint": 11,
"open_level": "public",
"tags": "while循环",
"top": false
}]
from email import contentmanager
import imp
import docx
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.image.image import Image
from docx.parts.image import ImagePart
from docx.oxml.shape import CT_Picture
from PIL import Image
from io import BytesIO
import sys
import base64
import struct
# Numbering prefixes "1." through "30." that mark the first paragraph of a question.
question_number_list = [f"{number}." for number in range(1, 31)]

# Fragments of the section headings that announce a question type; the
# character in front of them (the section ordinal) is skipped by the callers.
text_head_filling = "、填空题"
text_head_choice = "、选择题"
text_head_SQAS = "、简答题"

# Maps each heading fragment to its question-type name, i.e. the fragment
# without the leading "、".
type_dict = {
    head: head[1:]
    for head in (text_head_filling, text_head_choice, text_head_SQAS)
}
def get_picture(document: "Document", paragraph: "Paragraph"):
    """Collect the images embedded in one paragraph.

    Args:
        document: The document object that owns ``paragraph``; picture
            references are resolved through ``document.part.related_parts``.
        paragraph: The paragraph whose XML is searched for ``pic:pic``
            elements.

    Returns:
        ``None`` when the paragraph contains no picture element. Otherwise a
        list holding the ``image`` object of every picture whose reference
        could be resolved (the list is empty if none could).
    """
    pictures = paragraph._element.xpath('.//pic:pic')
    if not pictures:
        return None

    related_parts = document.part.related_parts
    images = []
    for picture in pictures:
        embed_ids = picture.xpath('.//a:blip/@r:embed')
        # A picture may carry no embed id (e.g. an image that is linked
        # rather than embedded) or an id with no matching part; skip it
        # instead of failing with IndexError / KeyError.
        if not embed_ids or embed_ids[0] not in related_parts:
            continue
        images.append(related_parts[embed_ids[0]].image)
    return images
def get_content(paragraph: Paragraph):
    """Classify one paragraph of an exam document and extract its text.

    Args:
        paragraph: The paragraph to inspect; only its ``text`` is read.

    Returns:
        A 4-tuple ``(new_question, new_type, new_key, content)``:

        * ``new_question`` -- ``True`` when the paragraph starts with a
          question number listed in ``question_number_list`` ("1." .. "30.").
        * ``new_type`` -- the question type from ``type_dict`` when the
          paragraph is a section heading (e.g. "一、选择题"), else ``""``.
        * ``new_key`` -- the field the text belongs to: ``"content"`` for the
          first paragraph of a question, ``"answer"`` for a paragraph
          containing "【答案】", ``"explain"`` for one containing "【解析】",
          or ``""`` when the paragraph continues the current field.
        * ``content`` -- the paragraph text after the recognised prefix.

        Blank paragraphs and figure captions ("第1题图", "第12题图") yield
        ``(False, "", "", "")``; section headings yield
        ``(False, <type>, "", "")``.
    """
    paragraph_text = paragraph.text
    stripped_text = paragraph_text.strip()
    skipped = (False, "", "", "")

    # Blank paragraphs carry nothing to store.
    if not stripped_text:
        return skipped

    # Figure captions of the form "第<n>题图" (one- or two-digit n) are dropped.
    if stripped_text[:1] == "第" and "题图" in (stripped_text[2:4], stripped_text[3:5]):
        return skipped

    # Section heading: the character at index 0 is the section ordinal and the
    # next four characters select the type. The stripped text is used so that
    # leading whitespace does not hide the heading.
    heading = stripped_text[1:5]
    if heading in type_dict:
        return False, type_dict[heading], "", ""

    new_question = False
    new_key = ""
    content_start = 0

    # First paragraph of a question: "<number>." directly after any leading
    # whitespace. Only a number at the very start counts, so decimals inside
    # the text (e.g. "3.14") are not mistaken for a new question.
    indent = len(paragraph_text) - len(paragraph_text.lstrip())
    dot = paragraph_text.find(".", indent)
    if dot > indent and paragraph_text[indent:dot + 1] in question_number_list:
        content_start = dot + 1
        new_question = True
        new_key = "content"

    # Answer / explanation markers. They are located in the same string that
    # is sliced below, so leading whitespace cannot shift the offsets.
    position_answer = paragraph_text.find("【答案】")
    position_explain = paragraph_text.find("【解析】")
    if position_answer >= 0:
        new_key = "answer"
        # NOTE(review): the marker is 4 characters long but 5 are skipped, so
        # the character right after "【答案】" is dropped as well -- presumably
        # a separator in the source documents; confirm before changing.
        content_start = position_answer + 5
    elif position_explain >= 0:
        new_key = "explain"
        content_start = position_explain + 4

    return new_question, "", new_key, paragraph_text[content_start:]
def _picture_to_data_uri(image):
    """Encode one image object (``blob`` + ``content_type``) as a base64 data URI."""
    encoded = base64.b64encode(image.blob).decode('utf-8')
    return "data:" + image.content_type + ";base64," + encoded


def ReadDocx2List(d: Document, *, debug=True):
    """Convert the paragraphs of an exam document into a list of question dicts.

    Parsing starts at the first section heading recognised through
    ``type_dict`` (or at the first paragraph when there is no heading). Each
    paragraph is then classified by ``get_content`` and appended to the
    matching field of the question currently being built.

    Args:
        d: The document to parse.
        debug: When true (the default, matching the original behaviour) every
            extracted text is printed, and every picture is opened in the
            image viewer and its data URI printed.

    Returns:
        A list of dicts with the keys ``"content"``, ``"answer"``,
        ``"explain"``, ``"type"`` and ``"pictures"``. Text fields hold their
        paragraphs joined with ``"\\n"``; ``"pictures"`` holds base64 data
        URIs joined with ``"-"``. The list is empty when no question was found.
    """
    # Read the paragraph list once instead of on every loop iteration.
    paragraphs = d.paragraphs

    current_key = "content"
    current_type = "选择题"  # type used until the first section heading is seen
    questions_list = []
    question_dict = {"content": "", "answer": "", "explain": "", "type": "", "pictures": ""}
    first = True

    # Skip everything in front of the first section heading. Without any
    # heading the whole document is parsed.
    start_row = 0
    for index, paragraph in enumerate(paragraphs):
        if paragraph.text.strip()[1:5] in type_dict:
            start_row = index
            break

    for paragraph in paragraphs[start_row:]:
        # A paragraph that contains pictures contributes only its pictures.
        image_list = get_picture(d, paragraph)
        if image_list:
            for image in image_list:
                if not image:
                    continue
                if debug:
                    Image.open(BytesIO(image.blob)).show()
                # Use the image's own MIME type rather than assuming JPEG.
                bs64 = _picture_to_data_uri(image)
                if question_dict["pictures"]:
                    question_dict["pictures"] += "-" + bs64
                else:
                    question_dict["pictures"] = bs64
                if debug:
                    print(bs64)
            continue

        new_question, temp_type, temp_key, result_text = get_content(paragraph)
        if debug:
            print(result_text)

        # Section heading: only the type for the following questions changes.
        if temp_type != "":
            current_type = temp_type
            current_key = "content"
            continue

        if temp_key != "":
            if new_question:
                if first:
                    # First question: reuse the initial dict. Assigning (not
                    # appending) discards any text collected before it.
                    first = False
                    question_dict['type'] = current_type
                else:
                    # Later question: close the previous one and start afresh.
                    questions_list.append(question_dict)
                    question_dict = {"content": "", "answer": "", "explain": "",
                                     "type": current_type, "pictures": ""}
                # A new question always starts in its "content" field.
                current_key = "content"
                question_dict[current_key] = result_text + "\n"
            else:
                # Answer or explanation marker: switch field, then store.
                current_key = temp_key
                question_dict[current_key] += result_text + "\n"
        elif result_text != "":
            # No marker: the paragraph continues the current field.
            question_dict[current_key] += result_text + "\n"

    # Close the last question, unless no question was ever started.
    if not first:
        questions_list.append(question_dict)
    return questions_list
if __name__ == "__main__":
    # Parse the sample document next to the script and dump the questions.
    print(ReadDocx2List(docx.Document('test.docx')))
代码中的注释已经比较详细了。有疑问的请留言,互相探讨。