python 从docx文件中读取文字和图片,其中图片编码成base64格式(高中信息技术题库系统)

网络上介绍读取docx文本的文章很多,但如何把每一个自然段逐一地存入相应的字典 key: value 中,却非常繁琐,需要把逻辑理得很清楚。

下面给出我的需求和方案。

python 从docx文件中读取文字和图片,其中图片编码成base64格式(高中信息技术题库系统)_第1张图片

 最终读取后形成如下格式:

[{
    "content": "输入一个正整数,输出所有的质因子。如24=2*2*2*3。实现上述功能的Python代码如下:\n\n\n\nn=int(input(″输入一个正整数:″))\n\n\n\ni=2\n\n\n\nwhile______①______:\n\n\n\n if n % i==0:\n\n\n\nn=n/i\n\n\n\nprint(i)\n\n\n\n else:\n\n\n\n______②______\n\n\n\n(1)在程序划线处填入合适的代码。\n\n\n\n(2)按照上述算法,输入60,依次输出的质因子是____________。",
    "answer": "(1)①n>1或n! =1 ②i+=1 (2)2 2 3 5",
    "explain": "最小的质因数是2,如果能被2整除,则反复相除,当不能被2整除时,将i增加1,尝试被3整除,如果还不能除通,往上增加到4,由于前面反复除2操作,因此不可能被不是质数的数除通。当相除的结果为1时,终止循环。输入60,可以被2除2次,被3除1次,被5除1次。",
    "reference": 53,
    "type": "填空题",
    "difficulty_level": "中级",
    "knowledgepoint": 11,
    "open_level": "public",
    "tags": "while循环",
    "top": false
}]

from email import contentmanager
import imp
import docx
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.image.image import Image
from docx.parts.image import ImagePart
from docx.oxml.shape import CT_Picture
from PIL import Image
from io import BytesIO
import sys
import base64
import struct

# Question-number prefixes "1." through "30."; a paragraph starting with one
# of these is treated as the first line of a question.
question_number_list = [f"{number}." for number in range(1, 31)]

# Section-heading markers. Each keeps the leading list separator "、" so that
# it lines up with characters 1..4 of a heading such as "一、选择题".
text_head_filling = "、填空题"
text_head_choice = "、选择题"
text_head_SQAS = "、简答题"

# Maps a heading marker to the question-type label (the marker without "、").
type_dict = {
    marker: marker[1:]
    for marker in (text_head_filling, text_head_choice, text_head_SQAS)
}
def get_picture(document: Document, paragraph:Paragraph):
    """Collect the images embedded in one paragraph.

    Args:
        document: The document object; its ``part.related_parts`` mapping is
            used to resolve relationship ids to image parts.
        paragraph: The paragraph whose ``pic:pic`` elements are inspected.

    Returns:
        A non-empty list with the ``.image`` object of every embedded picture,
        in document order, or ``None`` when the paragraph has no embedded
        picture (callers are expected to test the result for truthiness).

    Raises:
        KeyError: If a picture references a relationship id that is missing
            from ``document.part.related_parts``.
    """
    result_list = []
    for picture in paragraph._element.xpath('.//pic:pic'):
        embed_ids = picture.xpath('.//a:blip/@r:embed')
        if not embed_ids:
            # No r:embed attribute (presumably a linked rather than embedded
            # picture): there is no image part to read, so skip it instead of
            # failing with IndexError.
            continue
        related_part = document.part.related_parts[embed_ids[0]]
        result_list.append(related_part.image)
    return result_list or None


def get_content(paragraph:Paragraph):
    """Classify one paragraph and extract the text to store for it.

    Args:
        paragraph: Object exposing the paragraph's plain text as ``.text``.

    Returns:
        A 4-tuple ``(new_question, new_type, new_key, content)``:

        * ``new_question`` -- ``True`` when the paragraph starts with a
          question number from ``question_number_list`` ("1." .. "30.").
        * ``new_type`` -- the question-type label from ``type_dict`` when the
          paragraph is a section heading (e.g. "一、选择题"), otherwise ``""``.
        * ``new_key`` -- ``"content"``, ``"answer"`` or ``"explain"`` when the
          paragraph opens that field, ``""`` for a continuation paragraph.
        * ``content`` -- the text to store: what follows the question number
          or the 【答案】/【解析】 marker, or the whole unmodified paragraph for
          a continuation line (leading indentation is kept because question
          bodies may contain source code).

        Blank paragraphs and figure captions ("第N题图") yield
        ``(False, "", "", "")``.
    """
    raw_text = paragraph.text
    stripped = raw_text.strip()

    # Skip blank paragraphs.
    if not stripped:
        return False, "", "", ""

    # Skip figure captions such as "第1题图" or "第12题图".
    if stripped[:1] == "第" and "题图" in (stripped[2:4], stripped[3:5]):
        return False, "", "", ""

    # Section heading: characters 1..4 hold the marker, e.g. "一、选择题".
    # Matched on the stripped text so leading whitespace cannot hide a heading.
    heading = stripped[1:5]
    if heading in type_dict:
        return False, type_dict[heading], "", ""

    # All offsets below are computed and applied on the same string so that
    # leading whitespace cannot shift the slice.
    body = raw_text.lstrip()
    new_question = False
    new_key = ""
    content = raw_text

    # A question starts only when the number prefix opens the paragraph; a "."
    # later in the line (e.g. "x=1.5") must not start a new question.
    dot = body.find(".")
    if 0 < dot <= 2 and body[:dot + 1] in question_number_list:
        new_question = True
        new_key = "content"
        content = body[dot + 1:]

    # Answer / explanation markers; the answer marker takes precedence when
    # both appear. Whitespace or a colon directly after the marker is treated
    # as a separator and removed, so the field text keeps its first character
    # whether or not a separator is present.
    separators = " \t\u00a0\u3000:\uff1a"
    for marker, key in (("【答案】", "answer"), ("【解析】", "explain")):
        marker_at = body.find(marker)
        if marker_at >= 0:
            new_key = key
            content = body[marker_at + len(marker):].lstrip(separators)
            break

    return new_question, "", new_key, content
             

def _image_to_data_uri(image):
    """Encode an image object (with ``.blob``) as a base64 ``data:`` URI."""
    # NOTE(review): python-docx images are expected to expose ``content_type``
    # (e.g. "image/png"); fall back to the previously hard-coded JPEG type if
    # the attribute is missing or empty.
    mime = getattr(image, "content_type", None) or "image/jpeg"
    return "data:" + mime + ";base64," + base64.b64encode(image.blob).decode('utf-8')


def _empty_question(question_type=""):
    """Return a fresh question record with every field empty."""
    return {"content": "", "answer": "", "explain": "", "type": question_type, "pictures": ""}


def ReadDocx2List(d : Document, *, debug: bool = False):
    """Parse a question-bank document into a list of question dicts.

    Parsing starts at the first section heading (a paragraph whose characters
    1..4 match a key of ``type_dict``); if the document has no such heading it
    starts at the first paragraph. Each paragraph is classified by
    ``get_content`` and its text is appended, followed by a newline, to the
    current field of the current question. Images returned by ``get_picture``
    are encoded as data URIs and joined with "-" in the ``pictures`` field.

    Args:
        d: The document; ``d.paragraphs`` is iterated.
        debug: When true, print every extracted text and data URI and open
            each image in the system viewer. Off by default so the function
            has no side effects.

    Returns:
        A list of dicts with the keys ``content``, ``answer``, ``explain``,
        ``type`` and ``pictures``. Empty when nothing was collected.
    """
    # Read the property once; python-docx builds a new list on each access.
    paragraphs = d.paragraphs

    # Index of the first section heading, or 0 when there is none, so that a
    # document without headings is still parsed from the top.
    start_row = next(
        (row for row, para in enumerate(paragraphs)
         if para.text.strip()[1:5] in type_dict),
        0,
    )

    current_key = "content"      # field that continuation paragraphs go to
    current_type = "选择题"       # question type in effect
    questions_list = []          # finished questions
    question_dict = _empty_question()
    first = True                 # no question start seen yet

    for paragraph in paragraphs[start_row:]:
        # Attach any images in this paragraph to the current question.
        for image in get_picture(d, paragraph) or []:
            if not image:
                continue
            data_uri = _image_to_data_uri(image)
            if debug:
                Image.open(BytesIO(image.blob)).show()
                print(data_uri)
            if question_dict["pictures"] == "":
                question_dict["pictures"] = data_uri
            else:
                question_dict["pictures"] += "-" + data_uri

        # The paragraph's text is processed even when it also holds images;
        # an image-only paragraph has empty text and is ignored below.
        new_question, temp_type, temp_key, result_text = get_content(paragraph)
        if debug:
            print(result_text)

        # Section heading: only the question type in effect changes.
        if temp_type != "":
            current_type = temp_type
            current_key = "content"
            continue

        if temp_key != "":
            if new_question:
                if first:
                    # First question: reuse the initial record so images
                    # collected before it are kept.
                    first = False
                    question_dict['type'] = current_type
                else:
                    # Close the previous question and start a new record.
                    questions_list.append(question_dict)
                    question_dict = _empty_question(current_type)
                current_key = "content"
                question_dict[current_key] = result_text + "\n"
            else:
                # Answer or explanation paragraph: switch to that field.
                current_key = temp_key
                question_dict[current_key] += result_text + "\n"
        elif result_text != "":
            # Continuation paragraph of the current field.
            question_dict[current_key] += result_text + "\n"

    # Close the last question, unless nothing at all was collected.
    if any(question_dict[key] for key in ("content", "answer", "explain", "pictures")):
        questions_list.append(question_dict)
    return questions_list
if __name__ =="__main__":
    # Document to parse: the first command-line argument when given, otherwise
    # the sample file name, so running without arguments behaves as before.
    docx_path = sys.argv[1] if len(sys.argv) > 1 else 'test.docx'
    d = docx.Document(docx_path)
    data_list = ReadDocx2List(d)
    print(data_list)

代码中的注释已经比较详细了。有疑问的请留言,互相探讨。

你可能感兴趣的:(django,python,python,开发语言)