机器学习 数据预处理之数据打标签

工作内容:

1.读取pdf文档内容

2.分页显示

3.每行文本后设置下拉框,供手动选择标签

4.将标签和对应文本追加写入txt文档(data.txt)

代码:

import math
import os
import tkinter as tk
from tkinter import *
import tkinter.ttk as ttk
from tkinter.messagebox import *
import time
import pdfplumber as pp


# Requires: pip install pdfplumber

class GUI(object):
    """Tkinter window for hand-labelling the text lines of PDF resumes.

    The window offers a drop-down of the PDF files found in ``resume_dir``.
    Pressing the extract button reads the chosen PDF with pdfplumber, splits
    its text into lines and shows them on notebook tabs, each line paired
    with a read-only combobox holding the categories in ``resume_kinds``.
    Pressing the write button appends one ``"<label>     <text>"`` row per
    line to ``output_path``.

    Constructing the object builds the window and enters the Tk main loop,
    so ``GUI()`` blocks until the window is closed.

    Args:
        resume_dir: Folder scanned for ``.pdf`` files. Defaults to
            ``DEFAULT_RESUME_DIR``.
        output_path: Text file that labelled rows are appended to.
        lines_per_page: Number of lines shown per tab. Defaults to
            ``LINES_PER_PAGE``.

    Raises:
        ValueError: If ``lines_per_page`` is given and is smaller than 1.
    """

    # Folder used when the caller does not pass ``resume_dir``.
    DEFAULT_RESUME_DIR = r'D:\hk\微信资料\OA测试简历\OA测试简历'
    # Lines (and label comboboxes) placed on a single notebook tab.
    LINES_PER_PAGE = 21

    def __init__(self, resume_dir=None, output_path='data.txt', lines_per_page=None):
        print('begin time:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

        if lines_per_page is None:
            lines_per_page = self.LINES_PER_PAGE
        if lines_per_page < 1:
            raise ValueError('lines_per_page must be at least 1')

        self.resume_dir = self.DEFAULT_RESUME_DIR if resume_dir is None else resume_dir
        self.output_path = output_path
        self.lines_per_page = lines_per_page

        self.resume_kinds = ('基础资料', '教育经历', '校园经历', '项目经历', '工作经历', '专业技能', '自我评价', '求职意向', '其他')
        try:
            names = os.listdir(self.resume_dir)
        except OSError as exc:
            # A missing or unreadable folder should not prevent the window
            # from opening; the user is warned below instead.
            print('cannot list resume directory:', exc)
            names = []
        self.file_list = [name for name in names if name.endswith('.pdf')]
        self.content, self.page_num = [], 0
        self.lb_text, self.cmb = [], []

        # Main window
        self.root = Tk()
        self.root.title('简历打标签')
        self.root.geometry("680x620")

        # Notebook holding one tab per page of extracted lines
        self.tab_main = ttk.Notebook(self.root)
        self.tab_main.place(relx=0.05, rely=0.1, relwidth=0.9, relheight=0.8)

        # File selection drop-down
        self.cmb_files = ttk.Combobox(self.root, state='readonly')
        self.cmb_files['value'] = self.file_list
        if self.file_list:
            # current(0) raises TclError on an empty value list.
            self.cmb_files.current(0)
        self.cmb_files.place(relx=0.3, rely=0, relwidth=0.3, relheight=0.05)

        # Button that extracts the selected file
        self.extract_butt = Button(self.root, text='提取', command=self._extract_selected)
        self.extract_butt.place(relx=0.61, rely=0, relwidth=0.05, relheight=0.05)

        # Button that writes the labelled rows to disk
        self.writein_butt = Button(self.root, text='录入', command=self._write_in)
        self.writein_butt.place(relx=0.5, rely=0.91, relwidth=0.05, relheight=0.05)

        if not self.file_list:
            showwarning('提示框', '未找到pdf文件: ' + self.resume_dir)

        self.root.mainloop()

    @staticmethod
    def _collect_lines(page_texts):
        """Return all text lines from an iterable of per-page text values.

        Entries that are ``None`` or empty are skipped, so a page for which
        no text could be extracted contributes no lines instead of failing.
        """
        lines = []
        for text in page_texts:
            if text:
                lines.extend(text.splitlines())
        return lines

    @staticmethod
    def _paginate(lines, per_page):
        """Split ``lines`` into consecutive chunks of at most ``per_page`` items."""
        return [lines[start:start + per_page] for start in range(0, len(lines), per_page)]

    def _extract_selected(self):
        """Extract button callback: process the file chosen in the drop-down."""
        self._extract_file(self.cmb_files.get())

    def _extract_file(self, file):
        """Read ``file`` from the resume folder and rebuild the labelling tabs.

        Nothing happens unless ``file`` ends with ``.pdf`` (this also covers
        the empty selection). The previous tabs and state are replaced only
        after the PDF has been read successfully.
        """
        if not file.endswith('.pdf'):
            return

        path = os.path.join(self.resume_dir, file)
        print('path:', path)
        try:
            # The context manager closes the underlying file handle.
            with pp.open(path) as pdf:
                content = self._collect_lines(page.extract_text() for page in pdf.pages)
        except Exception as exc:  # GUI callback boundary: report and keep the old view
            print('failed to read', path, ':', exc)
            showerror('提示框', '无法读取文件: %s\n%s' % (file, exc))
            return

        pages = self._paginate(content, self.lines_per_page)
        self.content, self.page_num = content, len(pages)
        self.lb_text, self.cmb = [], []
        print(len(self.content))
        print('page_num:', self.page_num)

        # Throw away the old notebook together with all widgets inside it.
        self.tab_main.destroy()
        self.tab_main = ttk.Notebook(self.root)
        self.tab_main.place(relx=0.05, rely=0.05, relwidth=0.9, relheight=0.85)
        for number, page_lines in enumerate(pages, start=1):
            self._build_tab(number, page_lines)

    def _build_tab(self, number, page_lines):
        """Add one notebook tab showing ``page_lines`` with a combobox per line."""
        tab = Frame(self.tab_main)

        # Text on the left 90% of the tab, label comboboxes on the right 10%.
        labels_area = Frame(tab)
        cmbs_area = Frame(tab)
        labels_area.place(relx=0, rely=0, relwidth=0.9, relheight=1)
        cmbs_area.place(relx=0.9, rely=0, relwidth=0.1, relheight=1)

        # Notebook.add takes over geometry management of the tab frame.
        self.tab_main.add(tab, text='%i' % number)
        for line in page_lines:
            Label(labels_area, text=line).pack(anchor=E)
            # Spaces are stripped from the text that gets written to disk.
            self.lb_text.append(line.replace(' ', ''))

            combo = ttk.Combobox(cmbs_area, state='readonly')
            combo['value'] = self.resume_kinds
            combo.current(0)
            combo.pack()
            self.cmb.append(combo)

    def _write_in(self):
        """Write button callback: append every labelled line to the output file."""
        if not self.lb_text:
            showwarning('提示框', '没有可录入的数据,请先提取文件')
            return
        if not askyesno('提示框', '是否录入数据?(此操作会影响文本录入信息)'):
            return

        with open(self.output_path, 'a', encoding='utf-8') as f:
            f.writelines(
                combo.get() + '     ' + text + '\n'
                for combo, text in zip(self.cmb, self.lb_text)
            )
        print('录入成功')
        print('finish time:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))


# Script entry point: creating the GUI object starts the labelling tool.
if __name__ == '__main__':
    gui = GUI()

界面展示:

(原文此处为界面截图:机器学习 数据预处理之数据打标签_第1张图片)

 

你可能感兴趣的:(python,数据预处理,打标签,python,机器学习)