有五个章节的 Word 作业需要批改,问题在于每个同学填写答案的方式都不一样,例如:
有横线的选项(情况五)
当然还包括该同学这道题根本没有选答案,列举的情况并不包括所有的情况,故编码起来很繁琐,得同时考虑到很多种情况。
一开始我只看到了有横线的情况,所以我的方法是:
用正则表达式去匹配两侧带横线的 ABCD 答案。但这种方法在有横线的情况(一、三、四、五)中都匹配不上,只能说是天真了。
regu_lin_ans_lin = r"_[A-D]_"
pt_l_a_l = re.compile(regu_lin_ans_lin)
若继续加上匹配 `_[A-D]` 和 `[A-D]_` 的情况,也只是能多匹配几种情况而已。
既然无法用一个正则覆盖横线周围的各种写法,那就换个思路:只要检测到横线,再在这一行里寻找答案 ABCD 即可,所以分为两步:
1、先去检测横线;
2、然后检测当前行是否存在ABCD(可能会存在误差,因为题目中也可能涉及ABCD)
regu_ans = '[A-D]'
pt_a = re.compile(regu_ans)
text = para.text
if '_' in text:
ans = pt_a.findall(text)
if len(ans) == 0:
answers.append(-1) # if not detected marked as -1
is_mark = True
continue
ans = ans[0]
tmp = ch2num(ans)
answers.append(tmp)
index += 1
先检测出:第X题
然后去寻找学生给出的答案 ABCD;当然,它应该是出现在题目所给的 A.xxxxx 选项之前的 ABCD 字母。
这里就涉及到若某一题目根本没有做的情况。
# Quick check of the underscore-wrapped answer pattern. The stem below has
# no blank in it (the blank survives only in the trailing comment), so the
# pattern is expected to find nothing.
text = "在Excel中,下面对于自定义自动筛选说法中不正确的是"  # blank left out: ___C_____
pt = re.compile(r"_[A-D]_")
lin_ans_lin = pt.findall(text)
print(lin_ans_lin)
# Dump every paragraph of one submission, prefixed with its 1-based position,
# to see how the document is split up.
# The context manager closes the file handle, which the bare open() never did.
with open('第五章数据管理与分析-171xxxx-xxx.docx', 'rb') as f:
    doc = Document(f)
    print(doc)
    for index, para in enumerate(doc.paragraphs, start=1):
        print(str(index) + '段\t' + para.text)
将成绩填入对应的位置
import xlrd
import os
import re
from xlutils.copy import copy
excel_file_path = 'excel/课堂测试成绩.xlsx'

# step 1: open the workbook and take a writable copy of its first sheet
data = xlrd.open_workbook(excel_file_path)
book_new = copy(data)
sheet_new = book_new.get_sheet(0)
table = data.sheets()[0]
print(table.nrows)

# step 2: map student id -> row number (row 0 is skipped as the header)
dic_stuid_row_num = {}
for i in range(1, table.nrows):
    stu_key = table.row_values(i)[0]
    # xlrd hands numeric cells back as floats, whereas the ids parsed from the
    # score files below are digit strings; normalise so the lookup can match.
    if isinstance(stu_key, float) and stu_key.is_integer():
        stu_key = str(int(stu_key))
    dic_stuid_row_num[stu_key] = i
# print(dic_stuid_row_num)

# step 3: walk the per-chapter score files and write each score into the sheet
pt_num = re.compile(r"\d+")
files = ['chapter2.txt', 'chapter3.txt', 'chapter4.txt', 'chapter5.txt']
sub_nums = [32, 45, 30, 30]  # number of questions per chapter, same order as files
base_dir = 'files'
for index, (file_name, sub_num) in enumerate(zip(files, sub_nums)):
    path = os.path.join(base_dir, file_name)
    with open(path, 'r', encoding='UTF-8') as f:
        for line in f:
            # first number on the line is the student id, second the raw score
            rs = pt_num.findall(line)
            print(rs)
            if len(rs) < 2:
                # blank or malformed line: nothing to record
                continue
            stu_id = rs[0]
            score = int(rs[1]) / sub_num
            row_num = dic_stuid_row_num.get(stu_id)
            if row_num is not None:
                # chapter columns start at column 3; store a percentage string
                sheet_new.write(row_num, index + 3, '%.2f' % (score * 100))
book_new.save('课堂测试成绩.xls')
# 计算正确的答案个数(需事先给出答案)
import os
import re
from python_docx_tutorial.ans_extractor import extract
from python_docx_tutorial.score_counter import sc_count
if __name__ == '__main__':
    result = {}  # student id -> number of correct answers
    marked_file = []  # documents that need a manual look

    # Chapter configuration. Exactly one chapter is active at a time; the
    # others are kept commented out for reference.
    # Chapter 5
    # true_answers = [1, 4, 4, 2, 3, 2, 4, 3, 4, 4,
    #                 2, 3, 3, 4, 3, 3, 1, 1, 3, 4,
    #                 3, 3, 3, 2, 2, 1, 4, 3, 4, 4]
    # base_dir = 'C:\\Users\\lenovo02\\Documents\\WeChat Files\\Zipcoder\\Files\\第五章\\学生提交'
    # sub_num = 30
    # Chapter 4
    # true_answers = [2, 1, 4, 2, 4, 1, 4, 2, 1, 4,
    #                 1, 2, 2, 1, 3, 1, 3, 4, 4, 4,
    #                 1, 3, 1, 2, 3, 4, 2, 3, 3, 2]
    # base_dir = 'G:\\test\\课堂测试\\第四章\\课堂测试-学生提交'
    # sub_num = 30
    # Chapter 3 (multiple-answer and fill-in questions are awarded outright,
    # scoring them properly would need too much extra logic)
    # true_answers = [-1, 2, -1, 2, -1, -1, 2, 1, 1, 1,
    #                 -1, 1, 2, 3, 1, 2, -1, 1, -1, -1,
    #                 1, 2, 2, -1, 1, 1, 1, 1, -1, 1,
    #                 2, 1, 1, 2, 2, 2, 1, 1, 1, 1,
    #                 2, 2, 3, -1, 4]
    # base_dir = 'G:\\test\\课堂测试\\第三章\\学生提交-课堂测试'
    # sub_num = 45
    # Chapter 2 (same as chapter 3: multiple-answer and fill-in questions are
    # awarded outright; the last few questions have too many options, so none
    # of them is scored)
    true_answers = [1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
                    1, 1, 2, 2, 2, 2, 3, 1, 4, 4,
                    -1, -1, -1, -1, -1, -2, -2, -2, -2, -2,
                    -2, -2]
    base_dir = 'G:\\test\\课堂测试\\第二章\\提交'
    sub_num = 32

    # the student id is every digit of the document's file name, concatenated
    pt_stu_id = re.compile(r"[0-9]")

    for filename in os.listdir(base_dir):
        pathname = os.path.join(base_dir, filename)
        in_folder = os.path.isdir(pathname)
        if in_folder:
            # a per-student folder: the submission is its first entry
            entries = os.listdir(pathname)
            if not entries:
                # an empty folder used to crash on [0]; flag it instead
                marked_file.append(pathname)
                continue
            file_name = entries[0]
            file_path = os.path.join(pathname, file_name)
        else:
            file_name = filename
            file_path = pathname
        # print(file_path.title())
        stu_id = ''.join(pt_stu_id.findall(file_name))

        # One unreadable document must not abort the whole batch, so both
        # kinds of submission go through the same guard.
        try:
            with open(file_path, 'rb') as f:
                answers, marked = extract(f, sub_num)
        except Exception as e:
            print('读取失败,已标注:{},原因:{}'.format(file_path, e))
            marked_file.append(file_path)
            continue

        if marked:
            marked_file.append(file_path)
            # NOTE(review): kept from the original - a flagged document inside
            # a student folder is not scored, a flagged loose file still is.
            # Confirm whether that difference is intended.
            if in_folder:
                continue
        if len(answers) <= sub_num - 10:
            print('有答案题目少于给定阈值,为{}!'.format(len(answers)))
        # count the score
        score = sc_count(true_answers, answers)
        print('学号:{},成绩:{}'.format(stu_id, score))
        result[stu_id] = score  # can be score / sub_num

    # summary: scores ordered by student id, then the documents to re-check
    print('有成绩的人数:{}'.format(len(result)))
    print(result)
    for r in sorted(result):
        print('学号:{}, 成绩:{}'.format(r, result[r]))
    print('标注文档数目:{}'.format(len(marked_file)))
    for flagged in marked_file:
        print(flagged)
from docx import Document
import re
def ch2num(ch):
    """Convert an option letter to its 1-based number for later comparison.

    :param ch: one of ``'A'``, ``'B'``, ``'C'``, ``'D'``
    :return: 1 for ``'A'``, 2 for ``'B'``, 3 for ``'C'``, 4 for ``'D'``
    :raises KeyError: if ``ch`` is not one of the four letters
    """
    dic = {'A': 1, 'B': 2, 'C': 3, 'D': 4}
    return dic[ch]
def extract(f, total_sub):
    """Extract the multiple-choice answers from one submitted .docx file.

    Paragraphs are scanned in order. A paragraph containing exactly one
    ``第N题`` marker opens a question. From that paragraph on, until a
    paragraph whose stripped text starts with ``A`` is reached, the last
    ``A``-``D`` letter of the first paragraph containing one is taken as the
    answer. A question for which no letter was found is recorded as ``-1``
    so that later answers stay aligned with their question.

    Earlier approaches (matching ``_[A-D]``, or any ``A``-``D`` on a line that
    contains an underscore) were dropped because submissions are formatted
    too inconsistently.

    :param f: the document, opened in binary mode; its ``name`` is printed
    :param total_sub: number of questions the document is expected to hold
    :return: ``[answers, needs_review]``. ``answers`` holds 1-4 per answered
        question and ``-1`` per unanswered one. ``needs_review`` is ``True``
        when fewer than ``total_sub - 10`` answers were detected.
    """
    # step 1: patterns
    pt_sub = re.compile(r"第\d+题")
    pt_a = re.compile('[A-D]')

    # step 2: walk the document
    print('开始读取文档:{}'.format(f.name))
    doc = Document(f)

    answers = []
    index = 0  # number of questions for which a letter was detected
    is_find_sub = False  # still searching the current question for a letter
    pending = False  # a question was opened and has no entry in answers yet

    for para in doc.paragraphs:
        text = para.text
        if text.strip().startswith('A'):
            # reached this kind of line: stop searching the current question
            is_find_sub = False
        if len(pt_sub.findall(text)) == 1:
            # A new question starts. If the previous one never produced a
            # letter, hold its place with -1. The original guard for this
            # could never fire, which shifted every later answer by one.
            if pending:
                answers.append(-1)
            is_find_sub = True
            pending = True
        if is_find_sub:
            letters = pt_a.findall(text)
            if letters:
                # the last letter on the line is taken as the answer
                answers.append(ch2num(letters[-1]))
                is_find_sub = False
                pending = False
                index += 1
    if pending:
        # the final question was left unanswered
        answers.append(-1)

    # step 3: accept the result only if enough answers were detected
    if index >= total_sub - 10:
        print("抽取完毕,个数为:{}".format(index))
        return [answers, False]
    print("题目数量差太多,检查文档!")
    print('检测到的题目个数为:{}'.format(len(answers)))
    return [answers, True]
if __name__ == '__main__':
    # Manual smoke test against one sample submission. The context manager
    # closes the handle, which the bare open() never did.
    with open('C:\\Users\\lenovo02\\Documents\\WeChat Files\\Zipcoder\\Files\\第五章\\学生提交\\F110_192.168.117.110\\18120318叶宜宁.docx', 'rb') as sample:
        extract(sample, 30)
def sc_count(a, b):
    """Count how many of the extracted answers earn a point.

    :param a: the answer key. ``-1`` marks a question that is always awarded
        a point, ``-2`` a question that is never scored.
    :param b: the answers under test; ``-1`` means no answer was detected.
    :return: the number of points earned over the questions present in both
        sequences
    """
    count = 0
    # zip stops at the shorter sequence, so a submission with more detected
    # answers than the key has entries no longer raises IndexError.
    for expected, given in zip(a, b):
        if expected == -1:  # marked -1: awarded outright
            count += 1
        elif expected == -2:  # marked -2: not scored at all
            continue
        elif given != -1 and given == expected:
            count += 1
    return count
https://github.com/finepix/py_workspace/tree/master/python_docx_tutorial