解析markdown文件

第一部分是解析文件夹的部分:遍历目录,将所有的文件夹和文件解析出来。代码如下:

# Module-level state for the directory-scanning step.
# NOTE(review): allFileNum is not referenced by any code visible in this section.
allFileNum = 0
# Directory names grouped by nesting level: {level: [directory name, ...]}
dir_dict = defaultdict(list)


def print_path(level, path, father, f_id, version, db_session):
    """Record every directory and Markdown file under ``path``, recursively.

    Markdown files (names ending in ``.md`` that do not contain ``README``)
    found directly in ``path`` are registered with ``File.create_file`` and
    parsed with ``run``.  Afterwards each sub-directory is registered with
    ``Directory.create_dir``, appended to the module-level ``dir_dict`` under
    the current level, and scanned recursively with ``level + 1``.  Entries
    are visited in sorted name order.

    Args:
        level: Nesting level of ``path``.
        path: Directory to scan.
        father: Name of the parent directory, passed to ``Directory.create_dir``.
        f_id: Id of the parent directory record.
        version: Version value stored with every record.
        db_session: Database session handed to the model helpers.

    Returns:
        A list of ``(file path, file name, file id)`` tuples for every
        Markdown file found under ``path``, including nested directories.
    """
    depth = int(level)
    sub_dirs = []
    file_list = []

    for entry in sorted(os.listdir(path)):
        entry_path = path + '/' + entry
        if os.path.isdir(entry_path):
            # Skip hidden directories and "__xxx__" directories.  The check
            # must look at the whole name: a single first character can
            # never start with '__'.
            if not entry.startswith(('.', '__')):
                sub_dirs.append(entry)
        elif os.path.isfile(entry_path) and entry.endswith('.md') and 'README' not in entry:
            with open(entry_path, 'rb') as handle:
                # Size and md5 are computed from the raw bytes; a changed md5
                # value means the file content changed.
                raw = handle.read()
            size = len(raw)
            md5_value = md5_file_content(raw)
            # Create the file record: name, path, size (in KiB), version, md5.
            id_ = File.create_file(entry, str(entry_path), size / 1024, version, md5_value,
                                   db_session=db_session)
            file_list.append((entry_path, entry, id_))
            # Parse the content of this single file.
            run(entry_path, entry, version, id_)

    for dir_name in sub_dirs:
        # Store level, location, name, parent name and parent id of the directory.
        res = Directory.create_dir(dir_name, depth, path, father, f_id, version, db_session=db_session)
        dir_dict[depth].append(dir_name)
        # Recurse one level deeper, with this directory as the parent.
        file_list.extend(print_path(depth + 1, path + '/' + dir_name, dir_name, res.id, version, db_session))
    return file_list

第二部分是 run 函数,即逐行解析 Markdown 文件的部分。代码如下:

# 解析文件的部分
def run(source_file, name, version, id_):
    """Parse one Markdown file line by line and store what was found.

    Each line is handed to ``parse`` together with the state carried over
    from the previous line (inside a code block / inside a table, the
    collected code lines, the collected table rows and the code-type mark).
    A table still pending when the file ends is stored with content type 5.

    Args:
        source_file: Path of the file to parse.
        name: File name stored with every content record.
        version: Version value stored with every content record.
        id_: Id of the file record the content belongs to.

    Returns:
        An error string when the suffix is not ``.md`` / ``.markdown``,
        otherwise ``None``.
    """
    _, suffix = os.path.splitext(source_file)
    if suffix not in (".md", ".markdown"):
        # Rejected before any session is opened, so nothing is left unclosed.
        return 'Error: the file should be in markdown format'

    db_session = session()
    try:
        # Progress tracking: one increment per parsed file.
        pipe.incr('analysis_file_complete_count', 1)
        pipe.execute()

        is_code = False  # currently inside a fenced code block
        is_t = False  # currently inside a table
        content = None  # table rows collected so far
        content_c = None  # code lines collected so far
        mark = ''  # code-type mark of the open code block

        with open(source_file, "r", encoding='utf-8') as f:
            for line in f:
                is_code, is_t, content_c, content, mark = parse(
                    line, is_code, name, version, db_session, id_, is_t, mark, content, content_c)

        if content:
            # The file ended while a table was still open: flush it.
            FileContent.create_content(name, json.dumps(content), 5, version, id_, db_session=db_session)
    finally:
        # Always release the session, even when parsing fails.
        db_session.close()


# 解析部分
def parse(input_, is_code, name, version, db_session, id_, is_t, mark='', content=None, content_c=None):
    """Classify a single line by trying each recogniser in turn.

    The order is: code block, link, header, table, list, plain text.  The
    first recogniser that claims the line ends the chain.

    Returns:
        ``(is_c, is_t, content_c, content, mark)`` - the code-block flag, the
        table flag, the collected code lines, the collected table rows and
        the code-type mark, to be passed back in for the next line.
    """
    # Code blocks come first; ``mark`` is the code-type marker of the block.
    is_c, content_c, mark = test_code(input_, is_code, name, version, db_session, id_, mark, content_c=content_c)
    if is_c:
        return is_c, is_t, content_c, content, mark

    # Links (hyperlinks and images).
    if test_link(input_, name, version, db_session, id_):
        return is_c, is_t, content_c, content, mark

    # Headers.
    if test_header(input_, name, version, db_session, id_):
        return is_c, is_t, content_c, content, mark

    # Tables; the table state is only updated once this point is reached.
    is_t, content = test_table(input_, name, version, db_session, id_, is_t, content=content)
    if is_t:
        return is_c, is_t, content_c, content, mark

    # Lists, then plain text as the fallback.
    if not test_list(input_, name, version, db_session, id_):
        test_normal(input_, name, version, db_session, id_)
    return is_c, is_t, content_c, content, mark


# 匹配标题部分
def test_header(input_, name, version, db_session, id_):
    # 分析标题标记
    title_rank = 0
    for i in range(6, 0, -1):
        if input_[:i] == '#' * i:
            title_rank = i
            break
    else:
        return 0
    input_ = input_.replace('**', '')
    # 记录数据库
    FileContent.create_content(name, json.dumps([title_rank, input_[title_rank:]]), 4, version, id_,
                               db_session=db_session)
    return 1


def test_normal(input_, name, version, db_session, id_):
    # 匹配空行
    input_bold = ''
    if input_ == '\n':
        # FileContent.create_content(name, input_, 1, version, id_, db_session=db_session)
        return
    if '``' in input_:
        pass
    else:
        # 匹配加粗,并给加粗加上strong标签,前端懒,后端加上
        test_pattern = re.compile(r'(.*)\*\*(.*)\*\*(.*)')
        test_match = test_pattern.match(input_)
        pattern = re.compile(r'(.*?)\*\*(.*?)\*\*(.*?)')
        match = pattern.finditer(input_)

        if match:
            try:
                for i in match:
                    a = i.group(1)
                    b = i.group(2)
                    c = i.group(3)
                    input_bold += a + '' + b + '' + c
                # 拼接
                input_bold += test_match.group(3)
                input_ = input_bold
            except:
                pass
        input_last = input_.replace('	', '')
        FileContent.create_content(name, input_last, 2, version, id_, db_session=db_session)


# 匹配代码块
def test_code(input_, is_code, name, version, db_session, id_, mark, content_c=None):
    # 匹配代码块
    if not content_c:
        content_c = []
    pattern = re.compile(r'```(.*)\n')
    a = pattern.match(input_)
    if a or is_code:
        if a and is_code:
            is_code = False
            FileContent.create_content(name, json.dumps(content_c), 3, version, id_, marks=mark, db_session=db_session)
            content_c = []
        else:
            is_code = True
            try:
                mark = a.group(1)
            except:
                pass

            if not a:
                content_c.append(input_)

        return is_code, content_c, mark
    return is_code, content_c, mark


def all_same(lst, sym):
    """Return True when ``lst`` is empty or equals ``sym`` repeated ``len(lst)`` times.

    Used to recognise horizontal-rule lines.
    """
    if not lst:
        return True
    repeated = sym * len(lst)
    return repeated == lst


def test_list(input_, name, version, db_session, id_):
    # 解析有序序列
    if len(input_) > 2 and input_[0].isdigit() and input_[1] == '.':
        result = input_[2:]
        FileContent.create_content(name, result, 8, version, id_, db_session=db_session)
        return 1

    # 分析分割线标记
    if len(input_) > 2 and all_same(input_[:-1], '-') and input_[-1] == '\n':
        FileContent.create_content(name, input_, 10, version, id_, db_session=db_session)
        return 1

    # 解析无序列表
    if input_ != "" and input_[0] in ['+', '-']:
        result = input_[1:]
        FileContent.create_content(name, result, 9, version, id_, db_session=db_session)
        return 1
    return 0


def test_table(input_, name, version, db_session, id_, is_t, content=None):
    # 解析表格
    if not content:
        content = defaultdict(list)
    pattern = re.compile(r'^(.*)\|(.+)$')
    match = pattern.match(input_)
    if match:
        l = input_.split('|')
        l[-1] = l[-1][:-1]
        # 将空字符弹出列表
        if l[0] == '':
            l.pop(0)
        if l[-1] == '':
            l.pop(-1)
        if '--' in l[0]:
            return is_t, content
        if not is_t:
            content['th'].append(l)
        else:
            content['td'].append(l)
        is_t = True
    else:
        is_t = False

        if not is_t and content:
            FileContent.create_content(name, json.dumps(content), 5, version, id_, db_session=db_session)
            content = None
    return is_t, content


# 处理链接
def test_link(s, name, version, db_session, id_):
    # 超链接
    pattern = re.compile(r'(.*)\[(.*?)\]\((.*?)\)')
    match = pattern.finditer(s)
    for a in match:
        if a:
            text, url = a.group(1, 2)
            FileContent.create_content(name, json.dumps([text, url]), 7, version, id_, db_session=db_session)
            return 1

    # 图像链接
    pattern = re.compile(r'^!\[(.*)\]\((.*)\)')
    match = pattern.finditer(s)
    for a in match:
        if a:
            text, url = a.group(1, 2)
            FileContent.create_content(name, json.dumps([text, url]), 6, version, id_, db_session=db_session)
            return 1

    return 0

其中 is_c、is_t、is_code 等标志的作用是:代码块和表格都是跨越多行的结构,而解析是逐行进行的,因此必须用标志记录当前行是否仍处于代码块或表格之中,并在处理下一行时把这个状态传递下去。

你可能感兴趣的:(Python)