# Directory-analysis section
allFileNum = 0
# Directory names grouped by nesting level (dict of lists, filled by print_path)
dir_dict = defaultdict(list)
def print_path(level, path, father, f_id, version, db_session):
    """Record every sub-directory and Markdown file below ``path``, recursively.

    Markdown files (``*.md`` whose name does not contain ``README``) are
    registered with ``File.create_file`` and then parsed by ``run``. Every
    non-hidden sub-directory is registered with ``Directory.create_dir``,
    appended to the module-level ``dir_dict`` under its nesting level, and
    walked recursively.

    Args:
        level: Nesting depth of ``path``; children are walked at ``level + 1``.
        path: Directory to scan.
        father: Name of the parent directory, forwarded to
            ``Directory.create_dir``.
        f_id: Database id of the parent directory record.
        version: Version tag stored with every record.
        db_session: Database session forwarded to the model helpers.

    Returns:
        A list of ``(file_path, file_name, file_id)`` tuples for every
        Markdown file found, files of ``path`` itself first.
    """
    level = int(level)
    sub_dirs = []
    file_list = []

    # Sorted so that records are created in a deterministic order.
    for entry in sorted(os.listdir(path)):
        entry_path = path + '/' + entry
        if os.path.isdir(entry_path):
            # Skip hidden directories and dunder directories such as
            # __pycache__. The prefix test must run on the whole name: a
            # single character can never start with '__'.
            if not entry.startswith(('.', '__')):
                sub_dirs.append(entry)
        elif os.path.isfile(entry_path) and entry.endswith('.md') and 'README' not in entry:
            with open(entry_path, 'rb') as handle:
                content = handle.read()
            # The MD5 digest is stored so that a later run can tell whether
            # the file changed; the size is stored in KiB.
            md5_value = md5_file_content(content)
            id_ = File.create_file(entry, str(entry_path), len(content) / 1024, version, md5_value,
                                   db_session=db_session)
            file_list.append((entry_path, entry, id_))
            # Parse the single file.
            run(entry_path, entry, version, id_)

    for dir_name in sub_dirs:
        # Persist the directory (name, level, location, parent name, parent id).
        res = Directory.create_dir(dir_name, level, path, father, f_id, version, db_session=db_session)
        dir_dict[level].append(dir_name)
        # Recurse one level deeper, with this directory as the new parent.
        file_list.extend(print_path(level + 1, path + '/' + dir_name, dir_name, res.id, version, db_session))
    return file_list
# File-parsing section
def run(source_file, name, version, id_):
    """Parse one Markdown file line by line and store its content records.

    The ``analysis_file_complete_count`` counter is incremented through
    ``pipe`` once per parsed file (a Redis pipeline according to the original
    comment). Every line is then handed to ``parse``, which carries the
    multi-line state (open code block, open table) from one line to the next.

    Args:
        source_file: Path of the Markdown file.
        name: File name stored with every content record.
        version: Version tag stored with every content record.
        id_: Database id of the file record the content belongs to.

    Returns:
        An error string when ``source_file`` does not have a Markdown
        suffix, otherwise ``None``.
    """
    # Validate before acquiring anything so the early return leaks nothing.
    _, suffix = os.path.splitext(source_file)
    if suffix not in (".md", ".markdown"):
        return 'Error: the file should be in markdown format'

    is_code = False  # currently inside a fenced code block
    is_t = False  # currently inside a table
    content = None  # rows of the table being collected
    content_c = None  # lines of the code block being collected
    mark = ''  # language tag of the code block being collected

    db_session = session()
    try:
        # One call to run() means one file processed: bump the progress counter.
        pipe.incr('analysis_file_complete_count', 1)
        pipe.execute()
        with open(source_file, "r", encoding='utf-8') as f:
            for line in f:
                is_code, is_t, content_c, content, mark = parse(line, is_code, name, version, db_session, id_,
                                                                is_t, mark, content, content_c)
        # A table that runs to the end of the file has not been flushed yet.
        if content:
            FileContent.create_content(name, json.dumps(content), 5, version, id_, db_session=db_session)
        # Likewise a code block whose closing fence is missing: store what
        # was collected instead of silently dropping it.
        if is_code and content_c:
            FileContent.create_content(name, json.dumps(content_c), 3, version, id_, marks=mark,
                                       db_session=db_session)
    finally:
        # Always release the session, even when parsing raised.
        db_session.close()
# Line dispatcher
def parse(input_, is_code, name, version, db_session, id_, is_t, mark='', content=None, content_c=None):
    """Classify one Markdown line and hand it to the first matching handler.

    Handlers are tried in a fixed order: code fence / code body, link,
    header, table, list, and finally plain text. The first one that claims
    the line wins and the remaining ones are not called.

    Args:
        input_: The raw line.
        is_code: Whether the previous line left a code block open.
        name, version, db_session, id_: Forwarded unchanged to the handlers.
        is_t: Whether the previous line left a table open.
        mark: Language tag of the open code block.
        content: Table rows collected so far.
        content_c: Code lines collected so far.

    Returns:
        ``(is_c, is_t, content_c, content, mark)``: the updated code flag,
        table flag, code buffer, table buffer and language tag. ``is_t`` and
        ``content`` come back untouched when an earlier handler claimed the
        line.
    """
    is_c, content_c, mark = test_code(input_, is_code, name, version, db_session, id_, mark, content_c=content_c)

    # `or` short-circuits, so each handler only runs if the previous ones declined.
    claimed = (
        is_c
        or test_link(input_, name, version, db_session, id_)
        or test_header(input_, name, version, db_session, id_)
    )
    if not claimed:
        is_t, content = test_table(input_, name, version, db_session, id_, is_t, content=content)
        if not is_t and not test_list(input_, name, version, db_session, id_):
            test_normal(input_, name, version, db_session, id_)

    return is_c, is_t, content_c, content, mark
# 匹配标题部分
def test_header(input_, name, version, db_session, id_):
# 分析标题标记
title_rank = 0
for i in range(6, 0, -1):
if input_[:i] == '#' * i:
title_rank = i
break
else:
return 0
input_ = input_.replace('**', '')
# 记录数据库
FileContent.create_content(name, json.dumps([title_rank, input_[title_rank:]]), 4, version, id_,
db_session=db_session)
return 1
def test_normal(input_, name, version, db_session, id_):
# 匹配空行
input_bold = ''
if input_ == '\n':
# FileContent.create_content(name, input_, 1, version, id_, db_session=db_session)
return
if '``' in input_:
pass
else:
# 匹配加粗,并给加粗加上strong标签,前端懒,后端加上
test_pattern = re.compile(r'(.*)\*\*(.*)\*\*(.*)')
test_match = test_pattern.match(input_)
pattern = re.compile(r'(.*?)\*\*(.*?)\*\*(.*?)')
match = pattern.finditer(input_)
if match:
try:
for i in match:
a = i.group(1)
b = i.group(2)
c = i.group(3)
input_bold += a + '' + b + '' + c
# 拼接
input_bold += test_match.group(3)
input_ = input_bold
except:
pass
input_last = input_.replace(' ', '')
FileContent.create_content(name, input_last, 2, version, id_, db_session=db_session)
# 匹配代码块
def test_code(input_, is_code, name, version, db_session, id_, mark, content_c=None):
# 匹配代码块
if not content_c:
content_c = []
pattern = re.compile(r'```(.*)\n')
a = pattern.match(input_)
if a or is_code:
if a and is_code:
is_code = False
FileContent.create_content(name, json.dumps(content_c), 3, version, id_, marks=mark, db_session=db_session)
content_c = []
else:
is_code = True
try:
mark = a.group(1)
except:
pass
if not a:
content_c.append(input_)
return is_code, content_c, mark
return is_code, content_c, mark
def all_same(lst, sym):
    """Return True if ``lst`` is empty or equals ``sym`` repeated ``len(lst)`` times.

    Used to recognise horizontal-rule lines.
    """
    if not lst:
        return True
    return lst == sym * len(lst)
def test_list(input_, name, version, db_session, id_):
# 解析有序序列
if len(input_) > 2 and input_[0].isdigit() and input_[1] == '.':
result = input_[2:]
FileContent.create_content(name, result, 8, version, id_, db_session=db_session)
return 1
# 分析分割线标记
if len(input_) > 2 and all_same(input_[:-1], '-') and input_[-1] == '\n':
FileContent.create_content(name, input_, 10, version, id_, db_session=db_session)
return 1
# 解析无序列表
if input_ != "" and input_[0] in ['+', '-']:
result = input_[1:]
FileContent.create_content(name, result, 9, version, id_, db_session=db_session)
return 1
return 0
def test_table(input_, name, version, db_session, id_, is_t, content=None):
# 解析表格
if not content:
content = defaultdict(list)
pattern = re.compile(r'^(.*)\|(.+)$')
match = pattern.match(input_)
if match:
l = input_.split('|')
l[-1] = l[-1][:-1]
# 将空字符弹出列表
if l[0] == '':
l.pop(0)
if l[-1] == '':
l.pop(-1)
if '--' in l[0]:
return is_t, content
if not is_t:
content['th'].append(l)
else:
content['td'].append(l)
is_t = True
else:
is_t = False
if not is_t and content:
FileContent.create_content(name, json.dumps(content), 5, version, id_, db_session=db_session)
content = None
return is_t, content
# 处理链接
def test_link(s, name, version, db_session, id_):
# 超链接
pattern = re.compile(r'(.*)\[(.*?)\]\((.*?)\)')
match = pattern.finditer(s)
for a in match:
if a:
text, url = a.group(1, 2)
FileContent.create_content(name, json.dumps([text, url]), 7, version, id_, db_session=db_session)
return 1
# 图像链接
pattern = re.compile(r'^!\[(.*)\]\((.*)\)')
match = pattern.finditer(s)
for a in match:
if a:
text, url = a.group(1, 2)
FileContent.create_content(name, json.dumps([text, url]), 6, version, id_, db_session=db_session)
return 1
return 0
# 说明:is_c、is_t、is_code 这类标志之所以存在,是因为代码块和表格都是跨多行的结构,而解析是逐行进行的;因此必须在行与行之间传递一个标志,标明当前行是否位于表格或代码块之内。