# encoding: utf-8
# 输入的信息是已经解析为json格式的简历信息集 这个可以用pdfString文件执行这一部分的逻辑

# Assumptions: 在解析模块能够准确获得用户的姓名 对应岗位 和 简历正文

import pandas as pd
import numpy  as np
import jieba
import json
import pickle

from   collections import Counter

import os
import requests
import sys
import re

# self written
import infoextract
import pdfString
from data import Reference
import Try02

# sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from transformers import BertTokenizer, BertModel
import torch
import logging

# This file expects its JSON input to be a list of objects: [{}, {}, ..., {}]

# Set up paths
# FILEPATH is the data set that train_Model loads through inputSource.
# CSV_PATH is the location prefix inputSource uses when it exports a JSON
# source as CSV.
FILEPATH = "C:\\Alan .AIA\\Python\\CV_Auto\\data\\Result.csv"
CSV_PATH = "C:\\Alan .AIA\\Python\\CV_Auto\\data"
# Columns that function.extractDS adds to the data set; presumably these are
# the keys of preExtractor.info (name, infotext, jobs).
INFOLIST = ["name", "infotext", "jobs"]

# Machine-learning objects
# Module-level instances shared by textVary.tf_idf_regression (which fits them)
# and recommend_Resume (which reuses them to vectorize one resume).
tfidf   = TfidfTransformer()
counter = CountVectorizer(analyzer = 'word')

# 输入简历集
def inputSource (sourcePath):
    """Load a resume data set from a JSON or CSV file into a DataFrame.

    A ``.json`` source is additionally exported as ``<stem>.csv`` (encoded as
    utf_8_sig) inside the CSV_PATH folder.

    Args:
        sourcePath: Path of the source file; both ``\\`` and ``/`` are
            accepted as separators.

    Returns:
        The loaded pandas DataFrame.

    Raises:
        ValueError: If the file extension is neither ``.json`` nor ``.csv``.
    """
    # Split on either separator so Windows-style paths also work elsewhere.
    filename = re.split(r"[\\/]", sourcePath)[-1]
    stem, extension = os.path.splitext(filename)
    extension = extension.lower()
    if extension == ".json":
        targetDS = pd.read_json(sourcePath, encoding="utf-8")
        # os.path.join supplies the separator that plain concatenation lacked.
        targetDS.to_csv(os.path.join(CSV_PATH, stem + ".csv"), encoding="utf_8_sig")
    elif extension == ".csv":
        targetDS = pd.read_csv(sourcePath, encoding="utf-8")
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("Unsupported source file type: %r" % filename)
    return targetDS

class preExtractor (object):
    """Pre-parser that turns raw resume text into classifier features.

    After construction ``self.info`` holds three entries: ``name`` and
    ``jobs`` taken from ``infoextract.Extractor(...).search()``, and
    ``infotext``, the cleaned and tokenized resume text.
    """

    def __init__ (self, sourceText, filename):
        """Run the field extractor and tokenize the text.

        Args:
            sourceText: Full resume text.
            filename: Resume file name, forwarded to the extractor.
        """
        self.fullText = sourceText
        self.file_dir = filename
        # Extract information
        ansinfo = infoextract.Extractor(
            file_dir=self.file_dir, file_text=self.fullText, switch=1
        ).search()
        self.info = {
            "name": ansinfo["user_name"],
            "infotext": self.textCut(content=self.fullText),
            "jobs": ansinfo["jobs"],
        }

    def textCut (self, content):
        """Clean *content*, cut it with jieba, and drop stop words and digits.

        Returns:
            A list of tokens.
        """
        cleaned = self.textwasher(text=content, quit_universal=True)
        seg = list(jieba.cut(str(cleaned).strip()))
        # Remove stop words
        seg = function.removeStopword(seg)
        # Remove tokens that are purely numeric
        return [token for token in seg if not str(token).isdigit()]

    # Cleaning of Chinese resume text
    def textwasher (self, text, quit_universal):
        """Strip punctuation, whitespace and unsupported characters from *text*.

        Args:
            text: Raw resume text.
            quit_universal: When true, the last five lines are dropped first
                (the original comment says these are extra items appended for
                the insurer) and the remaining lines are joined with spaces.
                When false, the text is cleaned as-is.

        Returns:
            The cleaned text. Only digits, colons, CJK characters and ASCII
            letters survive; all whitespace is removed.
        """
        if quit_universal:
            text0 = " ".join(text.split('\n')[:-5])
        else:
            # Previously text0 was left unbound on this path.
            text0 = text
        # Text cleaning
        pattern1 = '[’!"#$%&\'()*+,-./:::;<=>?@[\\]^_`{|}~]+'
        pattern2 = '\\s+'
        pattern3 = r'[\n|\u3000|\s*$]'
        pattern4 = re.compile(r'[^\s1234567890::' + '\u4e00-\u9fa5' + 'a-zA-Z]+')
        # pattern1 + pattern2 removes punctuation runs followed by whitespace.
        text1 = re.sub(pattern1 + pattern2, '', text0)
        text2 = re.sub(pattern3, '', text1)
        text3 = re.sub(pattern4, '', text2)
        return text3

class function ():
    """Namespace of stateless helpers; every member is called on the class."""

    # Parse the data set
    @staticmethod
    def extractDS (targetDS):
        """Add the INFOLIST columns to *targetDS* in place.

        Each row's ``Text`` and ``File_name`` are passed to preExtractor and
        the resulting values are stored as strings. Returns None.
        """
        for key in INFOLIST:
            targetDS[key] = ""
        for index, row in targetDS.iterrows():
            ansinfo = preExtractor(sourceText=row["Text"], filename=row["File_name"]).info
            for key, value in ansinfo.items():
                targetDS.loc[index, key] = str(value)

    # Import stop words from a web site
    @staticmethod
    def getSiteStopword ():
        """Download the stop-word list once and cache it in data/stopWord.json.

        NOTE(review): the original body was partly unreadable. Returning the
        cached file's lines is an assumption; confirm against callers.
        STOPWORDSITE must be defined elsewhere in the module.

        Returns:
            The cached file's content split into lines.
        """
        cache_path = 'data/stopWord.json'
        if not os.path.exists(cache_path):
            stopWord = requests.get(STOPWORDSITE)
            # Do not cache an HTTP error page as if it were the word list.
            stopWord.raise_for_status()
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            with open(cache_path, "wb") as f:
                f.write(stopWord.content)
        with open(cache_path, "r") as f:
            return f.read().split("\n")

    # Remove stop words
    @staticmethod
    def removeStopword (wordList):
        """Return the words of *wordList* that are not in Reference.STOPWORDLIST."""
        filteredWords = [word for word in wordList if word not in Reference.STOPWORDLIST]
        return filteredWords

    # Job classification
    @staticmethod
    def classifyJobs (position):
        """Map a job title to its entry in Reference.JOBS_TYPE_DICT.

        The title is first reduced to its ASCII letters, digits, colons and
        whitespace; if more than two such characters remain, that reduced
        form is matched, otherwise the full title is. The first dictionary
        key contained in the title decides the result.

        Returns:
            The matched dictionary value, or 4 when nothing matches or the
            matched value is itself a string.
        """
        pattern1 = re.compile(r'[^\s1234567890::' + 'a-zA-Z]+')
        position1 = re.sub(pattern1, "", position)
        result = position1 if len(position1) > 2 else position
        jobsDict = Reference.JOBS_TYPE_DICT
        for key in jobsDict:
            if key in result:
                result = jobsDict[key]
                # Stop here: a non-string result would make the next
                # ``key in result`` raise TypeError.
                break
        if isinstance(result, str):
            # Unrecognized titles are currently classified as type 4 (BA).
            return 4
        return result

class   textVary (object):
    """Text vectorization / classification experiments, called on the class."""

    @staticmethod
    def tf_idf_regression (trainL, testL, y_trainL, y_testL):
        """Train a TF-IDF + logistic-regression classifier and report scores.

        Fits the module-level ``counter`` and ``tfidf`` on the training
        tokens, so they can be reused later to vectorize new documents.

        Args:
            trainL, testL: Iterables of token lists.
            y_trainL, y_testL: Matching labels.

        Returns:
            The fitted LogisticRegression (penalty 'l2', C=2).
        """
        # Set up the tf-idf model
        info_train1 = [' '.join(i) for i in trainL]
        info_test1 = [' '.join(i) for i in testL]
        tfidf_train = tfidf.fit_transform(counter.fit_transform(info_train1))
        # transform, not fit_transform: test data must be weighted with the
        # IDF statistics learned from the training set.
        tfidf_test = tfidf.transform(counter.transform(info_test1))
        print(tfidf_train.shape, tfidf_test.shape)
        # Train the model
        param_grid = {
            'C': [0.01, 0.1, 1.0, 2.0, 10, 100],
            'penalty': ['l2'],
        }
        clf = LogisticRegression()
        grid_search = GridSearchCV(
            estimator=clf,
            param_grid=param_grid,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
        )
        # NOTE(review): the search result is not used to choose C below.
        grid_search.fit(tfidf_train, y_trainL)
        lr_best = LogisticRegression(penalty='l2', C=2)
        lr_best.fit(tfidf_train, y_trainL)
        tf_idf_y_pred = lr_best.predict(tfidf_test)
        print('TF-IDF LR test accuracy %s' % metrics.accuracy_score(y_testL, tf_idf_y_pred))
        print('TF-IDF LR test F1_score %s' % metrics.f1_score(y_testL, tf_idf_y_pred, average="macro"))
        return lr_best

    @staticmethod
    def word2vec_regression (trainL, testL, y_trainL, y_testL):
        """Placeholder for a word2vec-based classifier (not implemented).

        NOTE(review): the original body was incomplete. It loaded
        'data/sgns.zhihu.word' through ``KeyedVectors.load_word2vec_format``
        (a name this module never imports) and started a GridSearchCV over
        undefined ``clf`` / ``param`` names, so it could never run.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("word2vec_regression is not implemented")

    @staticmethod
    def bert_regression (trainL, testL, y_trainL, y_testL):
        """Load the 'bert-mini' model and probe its embedding width.

        Only a smoke test as written: the arguments are unused and the
        method returns None.
        """
        # Set up basic information
        gpu = 0
        use_cuda = gpu >= 0 and torch.cuda.is_available()
        device = torch.device("cuda", gpu) if use_cuda else torch.device("cpu")
        logging.info("Use cuda: %s, gpu id: %d.", use_cuda, gpu)
        bert_model_dir = 'bert-mini'
        tokenizer = BertTokenizer.from_pretrained(bert_model_dir)
        Bertmodel = BertModel.from_pretrained(bert_model_dir)
        word = ['今天我是一个大笨蛋']
        # NOTE(review): max_length = 0 with truncation looks unintended; confirm.
        input_id = tokenizer(word, padding=True, truncation=True, max_length=0, return_tensors='pt')
        result = Bertmodel(input_id['input_ids'])
        vec_len = len(result[0][0][1])
def train_Model (model):
    """Build the training set from FILEPATH and train a job-type classifier.

    Args:
        model: ``"tfidf"`` pickles the TF-IDF model to
            'models/tfidf_model1.pkl'; ``"bert"`` additionally runs
            textVary.bert_regression and pickles its result to
            'models/bert_model1.pkl'.

    Returns:
        The fitted TF-IDF logistic-regression model (for every *model* value).
    """
    import ast

    # Step 01: load the data and derive the feature columns
    data = inputSource(sourcePath=FILEPATH)
    function.extractDS(targetDS=data)
    # Work on a copy so adding columns does not write into a slice of `data`.
    ProcessData = data[['name', 'infotext', 'jobs']].copy()
    # Column-wise apply keeps values aligned with the index whatever its order.
    ProcessData['Type'] = ProcessData['jobs'].apply(function.classifyJobs)
    # extractDS stored the token list as its string form; parse it back
    # without executing arbitrary code.
    ProcessData['infotext'] = ProcessData['infotext'].apply(ast.literal_eval)
    for index, row in ProcessData.iterrows():
        print(row['jobs'] + "  " + str(row['Type']))

    # Step 02: split into training and test sets (parameters are tunable)
    X_Set = ProcessData['infotext']  # X info
    Y_Set = ProcessData['Type']      # Y type
    test_ratio = 0.2
    x_train, x_test, y_train, y_test = train_test_split(
        X_Set, Y_Set, test_size=test_ratio, random_state=0
    )
    print("See Results\n")
    print(x_train.head(), y_train.head())

    os.makedirs('models', exist_ok=True)
    if model == "bert":
        bert_fn = textVary.bert_regression(
            trainL=x_train, testL=x_test, y_trainL=y_train, y_testL=y_test
        )
        with open('models/bert_model1.pkl', 'wb') as f:
            pickle.dump(bert_fn, f)
    # The TF-IDF model is always trained (once) because it is the return value.
    fn = textVary.tf_idf_regression(
        trainL=x_train, testL=x_test, y_trainL=y_train, y_testL=y_test
    )
    if model == "tfidf":
        with open('models/tfidf_model1.pkl', 'wb') as f:
            pickle.dump(fn, f)
    return fn
def recommend_Resume (targetPDF):
    """Train the TF-IDF model, then print the predicted job type for one resume.

    Args:
        targetPDF: Path of the resume PDF to classify.
    """
    targetFile = pdfString.Transformer(file_dir=targetPDF, quitD=1).info
    targetInfo = preExtractor(sourceText=targetFile["Text"], filename=targetPDF).info
    # infotext is already a token list; no eval round-trip is needed.
    targetText = list(targetInfo["infotext"])
    # Train the model (this also fits the shared counter / tfidf objects)
    train_Model(model="tfidf")
    # Load the model
    # NOTE: pickle must only ever be used on this locally produced file.
    with open('models/tfidf_model1.pkl', 'rb') as f:
        fn = pickle.load(f)
    try1 = [' '.join(targetText)]
    # transform, not fit_transform: keep the IDF weights learned in training
    # instead of refitting them on this single document.
    tfidf_try1 = tfidf.transform(counter.transform(try1))
    try1_pred = fn.predict(tfidf_try1)
    print("PREDICT: " + str(try1_pred) + ": " + Reference.JOB_RECOMMENDATION[try1_pred[0]])
# Main Function
if __name__ == "__main__":
        # Try out the trained model on one sample resume.
        targetPDF = "Kenny.pdf"
        # Trains the classifier and prints the predicted job type.
        recommend_Resume (targetPDF = targetPDF)
        # Extract the candidate's fields with Try02 and print them.
        ansinfo = Try02.Extractor(file_dir = targetPDF).search()
        Try02.Generator(sourceInfo = ansinfo).display()
# coding:utf-8
# 目前还缺乏研究 如果有多个专业应该怎么处理
# 多种方式比对

# 信息 先分块 后解析 准确率和效率提升

import os
import re
from   xml.dom.minidom import parse
import csv
import jieba
import pdfplumber as pb             
import sys        
import datetime
import pyDataverse as pd
import json
import sys
# import provinces

# PowerBi dataverse
BASE_URL = ""

# Input: folder that holds the resume files; used to walk through all resumes
FolderPath = r"C:\Alan .AIA\Python\CV_Automation\ResumeRespo"

PdfResumePath = []              # list of matching pdf resume files
DocxResumePath = []             # list of matching docx resume files
filename = []                   # resume file names
ids = []                        # resume ids, in order
data = []                       # final output data structure

# Reference sets

# Hand-picked list of about 200 common surnames, also exposed as a dictionary.
# NOTE(review): the entries of this list were not readable here. Only the four
# surnames shown in the example below are filled in; TODO restore the full list.
Surname_List = [
    '赵', '钱', '孙', '李',
]

Surname_Dict = dict(zip(Surname_List, range(len(Surname_List)))) # dictionary: {'赵':0,'钱':1,'孙':2,'李':3, ...}
# Majors
# NOTE(review): the entries of this list were not readable here; TODO restore.
Major_List = [
]

# Skills
Skillset_List = [
    'Java', 'C', 'WEB', 'SQL', 'EJB', 'Cpp', 'C#', 'dotnet', 'RPA', 'Python', 'HTML', 'Html', 'CSS', 'JavaScript', 'R', '外语', 'Office', '项目'
]

# Locations
Location_List = [
    '成都', '广州'
]

# Sources
Vendor_List = [
    '猎聘', '智联', '前程', '领英', '51'
]
# 子函数
# 抽取器 抽取单个文件的信息
class Extractor (object):
    """Extract candidate fields from a single resume file.

    The constructor reads the text and word list of a PDF with pdfplumber;
    ``search()`` then runs one heuristic per field and returns them in a
    dictionary. Files that are not PDFs are left with empty text, so every
    field falls back to its default.
    """

    def __init__ (self, file_dir):
        """Read the resume at *file_dir* (text is only loaded for ``.pdf``)."""
        self.fullWord = []
        self.fullText = ""
        self.file_dir = file_dir
        if os.path.splitext(self.file_dir)[1].lower() == ".pdf":
            # The context manager closes the PDF once all pages are read.
            with pb.open(self.file_dir) as pdf:
                for page in pdf.pages:
                    self.fullWord += page.extract_words()
                    self.fullText += page.extract_text() or ""

    @staticmethod
    def _word_text (word):
        """Return the text of a word entry (a pdfplumber dict or a plain string)."""
        return word["text"] if isinstance(word, dict) else str(word)

    def _lines (self):
        """Return the resume text split into lines."""
        return self.fullText.split("\n")

    # Utility stub: read a paragraph until a line is shorter than `length`
    @staticmethod
    def __readUntil (text, length):
        return ""

    # Required fields: name, position, major, phone, attachment, source, gender
    # 01 Name
    def __search_Name (self):
        """Find the candidate name; returns "" when none is recognized.

        Order: a 2-3 character CJK run in the file name that starts with a
        known surname; then, line by line, either the value of a "姓 名"
        field or the first surname-led CJK run on the line.
        """
        base_name = re.split(r"[\\/]", self.file_dir)[-1]
        for candidate in re.findall(r"[\u4e00-\u9fa5]{2,3}", base_name):
            if candidate[0] in Surname_List:
                return candidate

        surname_regex = None
        if Surname_List:
            # Alternation rather than a character class, so "|" is not
            # itself accepted as a surname.
            surname_regex = re.compile(
                "(?:" + "|".join(map(re.escape, Surname_List)) + ")"
                + r"[\u4e00-\u9fa5]{1,3}"
            )
        names = []
        for line in self._lines():
            if re.search(r"姓[ ]+名", line):
                found = re.search(r"姓[ ]+名[ :\\n]+([\u4e00-\u9fa5]{2,4})", line)
                if found:
                    names.append(found.group(1))
            elif surname_regex is not None:
                # No name field on this line: look for a surname-led word.
                found = surname_regex.search(line)
                if found:
                    return found.group(0)

        # Of the labelled candidates, return the first with a known surname.
        for name in names:
            if name[0] in Surname_List:
                return name
        return ""

    # 02 Position applied for
    def __search_Jobs (self):
        """Return the value that follows a job-title label, or ""."""
        job_titles = ["期望职位", "应聘职位", "期望从事职位"]
        for line in self._lines():
            for title in job_titles:
                found = re.search(
                    re.escape(title) + r"[::\s]*([a-z|A-Z|0-9|\u4e00-\u9fa5]{2,14})",
                    line,
                )
                if found:
                    return found.group(1)
        return ""

    # 03 Major
    def __search_Major (self):
        """Return the major, or "" when none is found.

        Order: per line, a "专 业" / "行 业" field value is remembered and any
        Major_List entry on the line is returned at once; then the first
        remembered field value; finally a scan of the individual words for a
        term ending in 专业 / 系 / 技术 or an English Bsc / Msc / Major tag.
        """
        field_patterns = (
            r"专[ ]+业[::\s]*([\u4e00-\u9fa5]{2,10})",   # 51job layout
            r"行[ ]+业[::\s]*([\u4e00-\u9fa5]{2,10})",   # Liepin layout
        )
        majors = []
        for line in self._lines():
            for pattern in field_patterns:
                found = re.search(pattern, line)
                if found:
                    majors.append(found.group(1))
            for premajor in Major_List:
                if premajor in line:
                    return premajor
        if majors:
            return majors[0]

        # Last resort: scan the words of the body text.
        for word in self.fullWord:
            text = self._word_text(word)
            for candidate in re.findall(r"[\u4e00-\u9fa5]{2,10}?(?:专业|系|技术)", text):
                # "…联系" (contact) is a known false positive of the 系 suffix.
                if not candidate.endswith("联系"):
                    return candidate
            # English majors; this part still needs refinement.
            found = re.search(r"[a-zA-Z]{2,5}?(?:Bsc|Msc|Major)", text)
            if found:
                return found.group(0)
        return ""

    # 04 Phone
    def __search_Phone (self):
        """Return a phone number: the first 11-15 digit run on a line labelled
        电话 / 手机, otherwise every distinct 11-15 digit run in the text
        joined by commas (in order of appearance), or ""."""
        for line in self._lines():
            if re.search(r"电\s*话", line) or re.search(r"手\s*机", line):
                # Drop brackets, colons, plus signs and dashes first.
                digits = re.findall(r"\d{11,15}", re.sub(r"[()()::+\-]", "", line))
                if digits:
                    return digits[0]
        text = re.sub(r"[()()+\-]", "", self.fullText)
        # dict.fromkeys de-duplicates while keeping a stable order.
        return ",".join(dict.fromkeys(re.findall(r"\d{11,15}", text)))

    # 06 Source (vendor)
    def __search_Vendor (self):
        """Return the first Vendor_List entry found in the path or in the
        first 21 lines of the text, or ""."""
        for vendor in Vendor_List:
            if vendor in self.file_dir:
                return vendor
        for line in self._lines()[:21]:
            for vendor in Vendor_List:
                if vendor in line:
                    return vendor
        return ""

    # 07 Gender (without an explicit value only a photo could tell)
    def __search_Gender (self):
        """Return "男" or "女" from the first 16 lines; otherwise the raw
        value of a "性 别" field if one was seen, or ""."""
        gender = ""
        for line in self._lines()[:16]:
            found = re.search(r"性[ ]+别[::\s]*([\u4e00-\u9fa5]{2,10})", line)
            if found:
                gender = found.group(1)
            # Word boundaries keep "Male" from matching inside "Female".
            if "男" in line or re.search(r"\bMale\b", line, re.IGNORECASE):
                return "男"
            if "女" in line or re.search(r"\bFemale\b", line, re.IGNORECASE):
                return "女"
        return gender

    # Optional fields
    # 08 Age
    def __search_Age (self):
        """Return the age as a string, or "".

        Uses the birth year on a "出生年月" line, or the number written
        directly before 岁 / 周岁.
        """
        current_year = datetime.datetime.now().year
        for line in self._lines():
            if "出生年月" in line:
                years = re.findall(r"\d{4}", line)
                if years:
                    return str(current_year - int(years[0]))
            found = re.search(r"(\d{1,3})\s*周?岁", line)
            if found:
                return found.group(1)
        return ""

    # 09 Employment status
    def __search_Condition (self):
        """Return the first status keyword found in the first 21 lines, or ""."""
        for line in self._lines()[:21]:
            for status in ("离职", "正在找工作", "在职"):
                if status in line:
                    return status
        return ""

    # 10 City
    def __search_City (self):
        """Return the first non-empty value following a location label, or ""."""
        patterns = (
            r"地点[::\s]*([\u4e00-\u9fa5]{2,8})",
            r"(?:所在地|现居地)[::\s]*([\u4e00-\u9fa5\s]{2,8})",
            r"住\s*址[::\s]*([\u4e00-\u9fa5\s]{2,8})",
            r"现居住[::\s]*([\u4e00-\u9fa5\s]{2,8})",
            r"Location[::\s]*([\u4e00-\u9fa5\s]{2,8})",
        )
        for line in self._lines():
            for pattern in patterns:
                found = re.search(pattern, line)
                if found:
                    city = re.sub(r"\s", "", found.group(1))
                    if city:
                        return city
        return ""

    # 11 Education level
    def __search_Stage (self):
        """Return the highest degree named on the first line that mentions
        one; "专科" when no line does."""
        ranked = (
            ("博士", ("博士",)),
            ("硕士", ("硕士", "研究生")),
            ("本科", ("大学", "本科")),
        )
        for line in self._lines():
            # Highest degree first, so "XX大学 硕士" is not reported as 本科.
            for stage, keywords in ranked:
                if any(keyword in line for keyword in keywords):
                    return stage
        return "专科"

    # 12 Hometown
    def __search_Hometown (self):
        """Return the value of a "籍 贯" field in the first 16 lines, or ""."""
        for line in self._lines()[:16]:
            found = re.search(r"籍[ ]+贯[::\s]*([\u4e00-\u9fa5]{2,10})", line)
            if found:
                return found.group(1)
        return ""

    # 13 Self-comment
    def __search_SelfComment (self):
        """Return the lines longer than 10 characters from the "自我评价"
        heading onwards, joined by newlines, or "".

        NOTE(review): the original only printed these lines and returned "";
        returning them is an assumption about the intent.
        """
        collected = []
        in_section = False
        for line in self._lines():
            if "自我评价" in line:
                in_section = True
            if in_section and len(line) > 10:
                collected.append(line)
        return "\n".join(collected)

    # 14 Working experience (not implemented)
    def __search_WorkExperience (self):
        return ""

    # 15 Education experience (not implemented)
    def __search_EducationExperience (self):
        return ""

    # 16 School
    def __search_School (self):
        """Return the school from the first line that names one, or "".

        Keyword order matters: 学院 usually follows 大学, so the last (most
        complete) match on the line is used.
        """
        college_signs = ["大学", "校区", "学院"]
        note_signs = ["毕业院校"]
        for line in self._lines():
            candidates = []
            for term in note_signs:
                found = re.search(term + r"[::\s]*([\u4e00-\u9fa5]{2,10})", line)
                if found:
                    candidates.append(found.group(1))
            for term in college_signs:
                if term in line:
                    # Turn separators into spaces so \S stops at them.
                    line = re.sub(r"[::|-]", " ", line)
                    candidates += re.findall(r"\S{2,10}" + term, line)
            if candidates:
                return re.sub(r"\s", "", candidates[-1])
        return ""

    # 17 Certificates (not implemented)
    def __search_Certificate (self):
        return ""

    # 18 Professional skills (not implemented)
    def __search_ProfessionalSkills (self):
        return ""

    @staticmethod
    def _normalize_salary (raw):
        """Remove whitespace and expand "<number>万" to plain digits."""
        compact = re.sub(r"[\s ]", "", raw)

        def expand (match):
            return str(int(round(float(match.group(1)) * 10000)))

        return re.sub(r"(\d+(?:\.\d+)?)万", expand, compact)

    # 19 Expected salary
    def __search_Salary (self):
        """Return the expected salary from the first line that carries one, or "".

        A "low-high" range on a "/月" line wins over the raw value after the
        "期望薪资" label.
        """
        for line in self._lines():
            candidates = []
            found = re.search(r"期望薪资[::\s]*(\S{2,10})", line)
            if found:
                candidates.append(found.group(1))
            if "/月" in line:
                candidates += re.findall(r"[0-9\.\s 万]{1,10}-[0-9\.\s 万]{1,10}", line)
            if candidates:
                return self._normalize_salary(candidates[-1])
        return ""

    # 20 Years of work (not implemented)
    def __search_WorkYears (self):
        return ""

    # 22 Email
    def __search_Email (self):
        """Return an email address: the first "@" token on a line labelled
        "邮 箱", otherwise the first word containing both "@" and ".", or ""."""
        token_pattern = r"[a-zA-Z0-9_\-.@]+"
        for line in self._lines():
            if re.search(r"邮[ ]+箱", line):
                for token in re.findall(token_pattern, line):
                    if "@" in token:
                        return token
        for word in self.fullWord:
            text = self._word_text(word)
            if "@" in text and "." in text:
                tokens = [t for t in re.findall(token_pattern, text) if "@" in t]
                if tokens:
                    return tokens[-1]
        return ""

    # Skills
    def __search_Skill (self):
        """Return the lines that mention any Skillset_List keyword.

        NOTE(review): the statement that filled the result was unreadable in
        the original; collecting each matching line once is an assumption.
        """
        skills = []
        for line in self._lines():
            if any(keyword in line for keyword in Skillset_List):
                skills.append(line)
        return skills

    # Entry point: return the search result
    def search (self):
        """Run every field search and return the results as a dictionary.

        Each field is searched independently; if one search fails, the error
        is reported on stderr and that field keeps its default "".
        """
        # Split on / or \ : the directory is the second-to-last part and the
        # file name is the last.
        sep_dir = re.split(r"/+|\\+", self.file_dir)
        file_name = sep_dir[-1]
        directory = sep_dir[-2] if len(sep_dir) > 1 else sep_dir[-1]
        info = {
            "Directory": directory, "file_name": file_name, "user_name": "", "email": "", "phone": "", "gender": "", "stage": "", "major": "", "age": "", "city": "", "skill": "", "jobs": "", "vendor": "", "condition": "", "hometown": "", "school": "", "salary": "", "selfComment": ""
        }
        # Methods are stored uncalled so each runs inside its own try below.
        searchers = {
            "user_name":    self.__search_Name,           # name
            "jobs":         self.__search_Jobs,           # position applied for
            "major":        self.__search_Major,          # major
            "phone":        self.__search_Phone,          # phone
            "vendor":       self.__search_Vendor,         # source
            "gender":       self.__search_Gender,         # gender
            "age":          self.__search_Age,            # age
            "condition":    self.__search_Condition,      # employment status
            "city":         self.__search_City,           # current city
            "stage":        self.__search_Stage,          # education level
            "hometown":     self.__search_Hometown,       # hometown
            "selfComment":  self.__search_SelfComment,    # self-comment
            "school":       self.__search_School,         # school
            "salary":       self.__search_Salary,         # expected salary
            "email":        self.__search_Email,          # email
            "skill":        self.__search_Skill,          # skills
        }
        for key, searcher in searchers.items():
            try:
                info[key] = searcher()
            except Exception as e:
                # Best effort per field: report and keep the default.
                print("Extractor: field '%s' failed for %s: %s" % (key, self.file_dir, e), file=sys.stderr)
        return info

# 猎聘
# class Lie-Pin (object):

# 智联
# class Zhi-Lian (object):
# 前程无忧
# class Qian-Cheng (object):

# 51jobs
# class Jobs (object):

# 遍历并读取函数
class Reader (object):
        """Collect the paths of resume files of one type from a single folder."""

        def __init__ (self, folder_Path):
                # Folder scanned by read()
                self.path = folder_Path

        def read (self, type):
                """Return the paths of matching regular files inside the folder.

                A file matches when its name contains ``type`` (for example ".pdf")
                and does not contain "$". Sub-directories are ignored.
                """
                ResumePath = []
                for file in os.listdir(self.path):
                        # Join against the instance folder, not a module-level global
                        filepath = os.path.join(self.path, file)
                        if not os.path.isfile(filepath):
                                continue
                        if (type in file) and ("$" not in file):
                                ResumePath.append(filepath)
                return ResumePath

# 输出生成函数
class Generator (object):
        """Present one extracted resume record on the console or as JSON text."""

        def __init__ (self, sourceInfo):
                # Mapping of extracted resume fields to present
                self.source = sourceInfo

        def display (self):
                """Print the main fields of the record to stdout; missing fields print as empty."""
                result = self.source
                print("################### Candidate ###################")
                fields = [
                        # Necessary info
                        ("Name     : ", "user_name"),
                        ("Position :", "jobs"),
                        ("Major    : ", "major"),
                        ("Phone    : ", "phone"),
                        ("Gender   : ", "gender"),
                        ("Source   : ", "file_name"),
                        ("Vendor   : ", "vendor"),
                        ("Condition: ", "condition"),
                        # Optional info
                        ("Email    : ", "email"),
                        ("City     : ", "city"),
                        ("Age      : ", "age"),
                        ("Stage    : ", "stage"),
                        ("Hometown : ", "hometown"),
                        ("School   : ", "school"),
                        ("Salary   : ", "salary"),
                ]
                for label, key in fields:
                        print(label, result.get(key, ""))

        def generate_Json (self):
                """Serialize the record to an indented JSON string.

                Returns an empty string (after printing the error) when the
                record cannot be serialized.
                """
                data_Json = ""
                try:
                        data_Json = json.dumps (self.source, sort_keys = True, indent = 4, separators=(',',':'), ensure_ascii = False)
                except (TypeError, ValueError) as e: print(e)
                return data_Json

# Json形式下的简历信息发布至 dataverse (Power BI)
class dataverse_Publish (object):
        """Publish resume data through the pyDataverse NativeApi client."""
        # Initialisation
        def __init__ (self, sourceJson):
                # Stored for later use; process() below does not read it
                self.source = sourceJson
        # Main routine
        def process (self):
                """Create, publish and fetch a dataverse through the API.

                Relies on module-level BASE_URL and API_TOKEN being defined.
                Each response overwrites the previous one and is not checked.
                """
                sourceFile = "TestJson.json"
                # Connect to the API endpoint
                from pyDataverse.api import NativeApi
                api = NativeApi(BASE_URL, API_TOKEN)
                # Create Collection of data
                from pyDataverse.models import Dataverse
                from pyDataverse.utils import read_file
                dv = Dataverse()
                resp = api.create_dataverse (":root", dv.json())
                resp = api.publish_dataverse ("Dataverse_Resumes")
                resp = api.get_dataverse ("Dataverse_Resumes")

# 杂项函数
class function:
        """Namespace for miscellaneous helpers; every helper is called on the class itself."""

        @staticmethod
        def displayPercent (counter, total, turn):
                """Validate the progress arguments and return an empty string.

                ``counter`` and ``total`` must be ints and ``turn`` a bool
                (True for a regular update, False for the final one). No output
                is produced, and ``total == 0`` no longer raises ZeroDivisionError.
                """
                assert (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool))
                return ""

        @staticmethod
        def initiateJson (filename):
                """Open ``filename`` for writing as an empty UTF-8 file and return the handle.

                The caller owns the returned file object and must close it.
                """
                assert (isinstance(filename, str) and (".json" in filename))
                Json_file = open(filename, 'w', encoding = 'utf-8')
                Json_file.seek(0)       # Move to position 0
                Json_file.truncate()    # Clear the Json file
                return Json_file

# 主函数
if __name__ == "__main__":
        # Step 1: list every resume file in the folder
        PdfResumePath  = Reader(folder_Path = FolderPath).read(".pdf")
        DocxResumePath = Reader(folder_Path = FolderPath).read(".docx")
        ResumeInfoList = []
        # Step 2: extract the information of every pdf resume
        counter   = 0
        total     = len(PdfResumePath)
        for file in PdfResumePath:
                counter = counter + 1
                ResumeInfoList.append (Extractor(file_dir = file).search())
                function.displayPercent (counter, total, True)
        function.displayPercent (counter, total, False)
        counter = 0
        length = len(ResumeInfoList)
        # Show every record and write all of them as one JSON array [{},{},...,{}]
        Json_file = function.initiateJson ("resume_Result.json")
        try:
                Json_file.write("[")
                for info in ResumeInfoList:
                        counter = counter + 1
                        generator = Generator(sourceInfo = info)
                        generator.display()
                        Result_Json = generator.generate_Json()
                        Json_file.write(Result_Json)
                        if (counter != length): Json_file.write(",")
                Json_file.write("]")
        finally:
                # Always release the handle, even when a record fails
                Json_file.close()
        # Step 3: export to dataverse
        # dataverse_Publish(sourceJson = Json_filename).process()


# 函数 读取信息             
# print (ResumePath[0])
# xingming_node = document_tree.getElementsByTagName("XingMing")[0]
# xingming = xingming_node.childNodes[0].data
# 函数 将一份简历信息写入 Excel 文件
# print (ResumePath)
# print (filename)
                # 在专业字段中寻找  
                for line in full_text.split("\n"):        
                        # 限制第十五行以内
                        if (counter > 20): break
                        if"离职", line): return "离职"
                        if"正在找工作", line): return "正在找工作"
                        if"在职", line): return "在职"
                        counter += 1                                
                return ""
        # 10 搜索城市函数  Cities
        def __search_City (self):
                """Return the first location found after a location label, or "".

                Lines of ``self.fullText`` are scanned in order; within a line the
                labels are tried in the order listed below. The label itself and
                all whitespace are removed from the returned value.
                """
                sep    = r"[:\uff1a\s]*"                # ASCII / full-width colon or spaces
                strict = r"([\u4e00-\u9fa5]{2,8})"      # Chinese characters only
                loose  = r"([\u4e00-\u9fa5\s]{2,8})"    # Chinese characters, possibly spaced out
                field_patterns = [re.compile(pattern) for pattern in (
                        r"地点" + sep + strict,
                        r"(?:所在地|现居地)" + sep + loose,
                        r"住\s*址" + sep + loose,
                        r"现居住地?" + sep + loose,
                        r"Location" + sep + loose,
                )]
                for line in self.fullText.split("\n"):
                        for pattern in field_patterns:
                                found = pattern.search(line)
                                if found is None:
                                        continue
                                # Capturing the value avoids stripping label characters out of it
                                location = re.sub(r"\s", "", found.group(1))
                                if location:
                                        return location
                return ""
        # 11 搜索学历函数  Stage
        def __search_Stage (self):
                """Return the education level: "博士", "硕士", "本科" or "专科".

                The first line that mentions a doctoral, master or bachelor
                keyword decides the result; when one line mentions several
                levels the highest one wins. "专科" is the fallback when no
                line decides.
                """
                # Ordered from highest to lowest so the first hit on a line is the highest level
                ranked_levels = [
                        ("博士", ["博士"]),
                        ("硕士", ["硕士", "研究生"]),
                        ("本科", ["大学", "本科"]),
                ]
                for line in self.fullText.split("\n"):
                        for stage, keywords in ranked_levels:
                                if any(keyword in line for keyword in keywords):
                                        return stage
                return "专科"
        # 12 搜索籍贯函数  Hometown
        def __search_Hometown (self):
                """Return the hometown that follows the "籍贯" label, or "".

                Only the first 16 lines of ``self.fullText`` are inspected
                (indexes 0 to 15, the bound used by the original counter check).
                The label may be written with or without spaces between its two
                characters; only the value is returned, not the label.
                """
                pattern = re.compile(r"籍\s*贯[:\uff1a\s]*([\u4e00-\u9fa5]{2,10})")
                for line in self.fullText.split("\n")[:16]:
                        found = pattern.search(line)
                        if found:
                                return found.group(1)
                return ""
        # 13 搜索自我评价函数  Self-Comment
        def __search_SelfComment (self):
                """Print the lines that follow the "自我评价" heading; always returns "".

                Extraction is not implemented yet: once the heading has been seen,
                every line longer than 10 characters is printed as a diagnostic.
                The text is read from ``self.fullText`` because ``self.fullWord``
                is a list and cannot be split into lines.
                """
                turn = False
                for line in self.fullText.split("\n"):
                        # Self-comment heading
                        if re.search(r"自我评价", line):
                                turn = True
                                print ("Yes" + self.file_dir)
                        if turn and (len(line) > 10):
                                print (line + "\n")
                return ""
        # 14 搜索工作经验函数   Working Experience
        def __search_WorkExperience (self):
                """Placeholder for work-experience extraction; currently always returns ""."""
                return ""
        # 15 搜索教育经历函数   Education Experience
        def __search_EducationExperience (self):
                """Placeholder for education-history extraction; currently always returns ""."""
                return ""
        # 16 搜索学校函数       School
        def __search_School (self):
                """Return the school name from the first line that mentions one, or "".

                A line contributes candidates in two ways: the text following the
                "毕业院校" label, and any token ending in "大学", "校区" or "学院".
                The last candidate of the line is returned with whitespace removed.
                """
                # Order matters: a college usually follows its university, so it is collected last
                College_signs = ["大学", "校区", "学院"]
                Note_signs = ["毕业院校"]
                punctuation = ':\uff1a|-'

                for line in self.fullText.split("\n"):
                        school_list = []
                        # Labelled field: keep only the value, never the label itself
                        for term in Note_signs:
                                school_list += re.findall(re.escape(term) + r"[:\uff1a\s]*([\u4e00-\u9fa5]{2,10})", line)
                        # Keyword hit: replace punctuation by spaces so \S stops at field boundaries
                        for term in College_signs:
                                if term in line:
                                        line = re.sub('[{}]'.format(punctuation), " ", line)
                                        school_list += re.findall(r"\S{2,10}" + term, line)
                        if school_list:
                                return re.sub(r"\s", "", school_list[-1])
                return ""
        # 17 搜索证书函数       Certificate
        def __search_Certificate (self):
                """Placeholder for certificate extraction; currently always returns ""."""
                return ""

        # 18 搜索专业技能函数   Skill-Set
        def __search_ProfessionalSkills (self):
                """Placeholder for professional-skill extraction; currently always returns ""."""
                return ""
        # 19 搜索期望薪资函数   Expected Salaries
        def __search_Salary (self):
                """Return the expected salary from the first line that states one, or "".

                Candidates are the text following the "期望薪资" label and, on lines
                containing "/月", any "low-high" range. The last candidate of the
                line is used, whitespace is removed and amounts written with "万"
                are converted to plain numbers (1.5万 becomes 15000).
                """
                label_pattern = re.compile(r"期望薪资[:\uff1a\s]*(\S+)")
                # A range is symmetric around "-": from so much to so much
                range_pattern = re.compile(r"[0-9.\s万]{1,10}-[0-9.\s万]{1,10}")

                def to_number (match):
                        # Multiply instead of appending zeros so decimals stay correct
                        return str(int(round(float(match.group(1)) * 10000)))

                for line in self.fullText.split("\n"):
                        salary_list = label_pattern.findall(line)
                        if "/月" in line:
                                salary_list += range_pattern.findall(line)
                        if salary_list:
                                salary = re.sub(r"\s", "", salary_list[-1])
                                return re.sub(r"(\d+(?:\.\d+)?)万", to_number, salary)
                return ""
        # 20 搜索工作年限函数   Working Stages
        def __search_WorkYears (self):
                """Placeholder for years-of-experience extraction; currently always returns ""."""
                return ""
        # 21 搜索区号函数
        # 22 搜索邮箱函数  Email
        def __search_Email (self):
                """Return the first e-mail address found in the resume, or "".

                Lines of ``self.fullText`` carrying the "邮箱" label are tried first,
                then every entry of ``self.fullWord``. A word may be a dict with a
                "text" key or a plain string.
                """
                address = re.compile(r"[a-zA-Z0-9_\-.]+@[a-zA-Z0-9_\-]+(?:\.[a-zA-Z0-9_\-]+)+")
                # First look at the labelled e-mail field
                for line in self.fullText.split("\n"):
                        if re.search(r"邮\s*箱", line):
                                found = address.search(line)
                                if found:
                                        return found.group(0)
                # Then fall back to scanning every extracted word
                for word in self.fullWord:
                        text = word["text"] if isinstance(word, dict) else word
                        found = address.search(text)
                        if found:
                                return found.group(0)
                return ""

        # 搜索技能函数  Search Skills
        def __search_Skill (self):
                """Return the lines of ``self.fullText`` that mention a known skill keyword.

                Keywords come from the module-level ``Skillset_List``. Each
                matching line is added once, however many keywords it contains.
                """
                Skills = []
                for line in self.fullText.split("\n"):
                        # Plain substring test: keywords such as "C#" are not regular expressions
                        if any(keyword in line for keyword in Skillset_List):
                                Skills.append(line)
                return Skills
        # 入口函数 返回搜索结果
        def search (self):
                """Run every field extractor and return the results as one dict.

                "Directory" and "file_name" come from ``self.file_dir``, split on
                "/" or "\\"; every other field defaults to "". An extractor that
                raises is reported on stdout and leaves its field at the default,
                so one failure does not lose the remaining fields.
                """
                # The directory is the second-to-last path component, the file name the last one
                sep_dir = re.split(r"/+|\\+", self.file_dir)
                file_name = sep_dir[-1]
                directory = sep_dir[-2] if len(sep_dir) > 1 else ""
                info = {
                        "Directory": directory, "file_name": file_name, "user_name": "", "email": "", "phone": "", "gender": "", "stage": "", "major": "", "age": "", "city": "", "skill": "", "jobs": "", "vendor": "", "condition": "", "hometown": "", "school": "", "salary": "", "selfComment": ""
                }
                # Bound methods, called lazily inside the try block below
                func = {
                        "user_name":    self.__search_Name,             # name
                        "jobs":         self.__search_Jobs,             # position applied for
                        "major":        self.__search_Major,            # major
                        "phone":        self.__search_Phone,            # phone
                        "vendor":       self.__search_Vendor,           # source
                        "gender":       self.__search_Gender,           # gender
                        "age":          self.__search_Age,              # age
                        "condition":    self.__search_Condition,        # employment status
                        "city":         self.__search_City,             # current city
                        "stage":        self.__search_Stage,            # education level
                        "hometown":     self.__search_Hometown,         # hometown
                        "selfComment":  self.__search_SelfComment,      # self evaluation
                        "school":       self.__search_School,           # school
                        "salary":       self.__search_Salary,           # expected salary
                        "email":        self.__search_Email,            # e-mail
                        "skill":        self.__search_Skill,            # skills
                }
                for key, extract in func.items():
                        try:
                                info[key] = extract()
                        except Exception as e:
                                # Best effort: keep the default value for this field
                                print(key, e)
                return info

# 猎聘
# class Lie-Pin (object):

# 智联
# class Zhi-Lian (object):
# 前程无忧
# class Qian-Cheng (object):

# 51jobs
# class Jobs (object):

# 遍历并读取函数
class Reader (object):
        """Collect the paths of resume files of one type from a single folder."""

        def __init__ (self, folder_Path):
                # Folder scanned by read()
                self.path = folder_Path

        def read (self, type):
                """Return the paths of matching regular files inside the folder.

                A file matches when its name contains ``type`` (for example ".pdf")
                and does not contain "$". Sub-directories are ignored.
                """
                ResumePath = []
                for file in os.listdir(self.path):
                        # Join against the instance folder, not a module-level global
                        filepath = os.path.join(self.path, file)
                        if not os.path.isfile(filepath):
                                continue
                        if (type in file) and ("$" not in file):
                                ResumePath.append(filepath)
                return ResumePath

# 输出生成函数
class Generator (object):
        """Present one extracted resume record on the console or as JSON text."""

        def __init__ (self, sourceInfo):
                # Mapping of extracted resume fields to present
                self.source = sourceInfo

        def display (self):
                """Print the main fields of the record to stdout; missing fields print as empty.

                The banner shows the module-level ``counter`` when the calling
                script defines one, and an empty string otherwise.
                """
                result = self.source
                print("################### Candidate ", globals().get("counter", ""), " ###################")
                fields = [
                        # Necessary info
                        ("Name     : ", "user_name"),
                        ("Position :", "jobs"),
                        ("Major    : ", "major"),
                        ("Phone    : ", "phone"),
                        ("Gender   : ", "gender"),
                        ("Source   : ", "file_name"),
                        ("Vendor   : ", "vendor"),
                        ("Condition: ", "condition"),
                        # Optional info
                        ("Email    : ", "email"),
                        ("City     : ", "city"),
                        ("Age      : ", "age"),
                        ("Stage    : ", "stage"),
                        ("Hometown : ", "hometown"),
                        ("School   : ", "school"),
                        ("Salary   : ", "salary"),
                ]
                for label, key in fields:
                        print(label, result.get(key, ""))

        def generate_Json (self):
                """Serialize the record to an indented JSON string.

                Returns an empty string (after printing the error) when the
                record cannot be serialized.
                """
                data_Json = ""
                try:
                        data_Json = json.dumps (self.source, sort_keys = True, indent = 4, separators=(',',':'), ensure_ascii = False)
                except (TypeError, ValueError) as e: print(e)
                return data_Json

# Json形式下的简历信息发布至 dataverse (Power BI)
class dataverse_Publish (object):
        """Publish resume data through the pyDataverse NativeApi client."""
        # Initialisation
        def __init__ (self, sourceJson):
                # Stored for later use; process() below does not read it
                self.source = sourceJson
        # Main routine
        def process (self):
                """Create, publish and fetch a dataverse through the API.

                Relies on module-level BASE_URL and API_TOKEN being defined.
                Each response overwrites the previous one and is not checked.
                """
                sourceFile = "TestJson.json"
                # Connect to the API endpoint
                from pyDataverse.api import NativeApi
                api = NativeApi(BASE_URL, API_TOKEN)
                # Create Collection of data
                from pyDataverse.models import Dataverse
                from pyDataverse.utils import read_file
                dv = Dataverse()
                resp = api.create_dataverse (":root", dv.json())
                resp = api.publish_dataverse ("Dataverse_Resumes")
                resp = api.get_dataverse ("Dataverse_Resumes")

# 杂项函数
class function:
        """Namespace for miscellaneous helpers; every helper is called on the class itself."""

        @staticmethod
        def displayPercent (counter, total, turn):
                """Validate the progress arguments and return an empty string.

                ``counter`` and ``total`` must be ints and ``turn`` a bool
                (True for a regular update, False for the final one). No output
                is produced, and ``total == 0`` no longer raises ZeroDivisionError.
                """
                assert (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool))
                return ""

        @staticmethod
        def initiateJson (filename):
                """Open ``filename`` for writing as an empty UTF-8 file and return the handle.

                The caller owns the returned file object and must close it.
                """
                assert (isinstance(filename, str) and (".json" in filename))
                Json_file = open(filename, 'w', encoding = 'utf-8')
                Json_file.seek(0)       # Move to position 0
                Json_file.truncate()    # Clear the Json file
                return Json_file

# 主函数
if __name__ == "__main__":
        # Step 1: list every resume file in the folder
        PdfResumePath  = Reader(folder_Path = FolderPath).read(".pdf")
        DocxResumePath = Reader(folder_Path = FolderPath).read(".docx")
        ResumeInfoList = []
        # Step 2: extract the information of every pdf resume
        counter   = 0
        total     = len(PdfResumePath)
        for file in PdfResumePath:
                counter = counter + 1
                ResumeInfoList.append (Extractor(file_dir = file).search())
                function.displayPercent (counter, total, True)
        function.displayPercent (counter, total, False)
        counter = 0
        length = len(ResumeInfoList)
        # Show every record and write all of them as one JSON array [{},{},...,{}]
        Json_file = function.initiateJson ("resume_Result.json")
        try:
                Json_file.write("[")
                for info in ResumeInfoList:
                        # Generator.display reads this module-level counter for its banner
                        counter = counter + 1
                        generator = Generator(sourceInfo = info)
                        generator.display()
                        Result_Json = generator.generate_Json()
                        Json_file.write(Result_Json)
                        if (counter != length): Json_file.write(",")
                Json_file.write("]")
        finally:
                # Always release the handle, even when a record fails
                Json_file.close()
        # Step 3: export to dataverse
        # dataverse_Publish(sourceJson = Json_filename).process()


# 函数 读取信息             
# print (ResumePath[0])
# xingming_node = document_tree.getElementsByTagName("XingMing")[0]
# xingming = xingming_node.childNodes[0].data
# 函数 将一份简历信息写入 Excel 文件
# print (ResumePath)
# print (filename)

# coding:utf-8
# 这个版本是用于简历分栏
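# New approach: walk through the content of every text item and check whether its length is only 4 characters,
# then check whether it belongs to the set of section delimiters; if it does, split the resume at that point.
#
# With table extraction, it seems that only Deng's resume can be handled this way.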

import os
import re
from   xml.dom.minidom import parse
import csv
import jieba
import pdfplumber as pb             
import sys        
import datetime
import pyDataverse as pd
import json
import sys

from collections import OrderedDict

# Input: folder that contains the resume files; it is scanned for every resume
FolderPath = r"C:\Alan .AIA\Python\ResumeRespo"  

PdfResumePath = []              # list of matching pdf resume files
DocxResumePath = []             # list of matching docx resume files
filename = []                   # resume file names
ids = []                        # resume ids, assigned in order
data = []                       # final output data structure

# 抽取器 抽取单个文件的信息
class Extractor (object):
    """Read one resume file and split its text into sections."""

    def __init__ (self, file_dir):
        """Load the words and text of ``file_dir`` and slice the text into sections.

        Only ".pdf" files are read; any other file leaves the word list
        and the text empty. ``ansList`` holds the sections produced by slide().
        """
        self.fullWord = []
        self.fullText = ""
        self.file_dir = file_dir
        self.ansList  = []
        if os.path.splitext(self.file_dir)[1] == ".pdf":
            # The context manager closes the pdf once every page is read
            with pb.open(self.file_dir) as pdf:
                for page in pdf.pages:
                    # Words
                    self.fullWord += page.extract_words()
                    # Text; extract_text() may return None for an empty page
                    self.fullText += page.extract_text() or ""
        self.ansList = self.slide()

    def slide (self):
        """Split ``self.fullText`` into sections and return them as a list of lists.

        Every line is de-duplicated with function.quitDuplicate. A line
        shorter than 5 characters closes the text collected so far; when it
        also contains a heading keyword it closes the current section and
        becomes the first line of the next one.
        """
        pattern_list = ["信息", "评价", "经历", "经验", "信息", "技能", "意向"]
        paragraph_list = []
        paragraph = []
        currentText = ""
        for raw_line in self.fullText.split('\n'):
            # Drop the doubled Chinese characters of the line
            line = function.quitDuplicate(raw_line)
            if len(line) < 5:
                # Store the text collected before this short line
                if currentText:
                    paragraph.append(currentText)
                currentText = ""
                # A heading keyword starts a new section
                if any(keyword in line for keyword in pattern_list):
                    if paragraph:
                        paragraph_list.append(paragraph)
                    paragraph = []
            currentText += line + "\n"
        # Add the last piece of text and the last section
        paragraph.append(currentText)
        paragraph_list.append(paragraph)
        return paragraph_list

    def search (self):
        """Return the directory name, file name and sections of the resume as a dict."""
        # The directory is the second-to-last path component, the file name the last one
        sep_dir = re.split(r"/+|\\+", self.file_dir)
        file_name = sep_dir[-1]
        directory = sep_dir[-2] if len(sep_dir) > 1 else ""
        info = {"Directory": directory, "file_name": file_name, "para": self.ansList}
        return info
# 遍历并读取函数
class Reader (object):
    """Collect the paths of resume files of one type from a single folder."""

    def __init__ (self, folder_Path):
        # Folder scanned by read()
        self.path = folder_Path

    def read (self, type):
        """Return the paths of matching regular files inside the folder.

        A file matches when its name contains ``type`` (for example ".pdf")
        and does not contain "$". Sub-directories are ignored.
        """
        ResumePath = []
        for file in os.listdir(self.path):
            # Join against the instance folder, not a module-level global
            filepath = os.path.join(self.path, file)
            if not os.path.isfile(filepath):
                continue
            if (type in file) and ("$" not in file):
                ResumePath.append(filepath)
        return ResumePath

# 输出生成函数
class Generator (object):
    """Print a summary of one sliced resume record."""

    def __init__ (self, sourceInfo):
        # Record to present; display() reads its "para" entry
        self.source = sourceInfo

    def display (self):
        """Print the number of sections of the record followed by a banner.

        The banner shows the module-level ``counter`` when the calling
        script defines one, and an empty string otherwise.
        """
        result = self.source
        # Use the record held by this instance, not the caller's loop variable
        print("Length: " + (str)(len(result["para"])))
        print("################### Candidate ", globals().get("counter", ""), " ###################")
# 杂项函数
class function:
    """Namespace for miscellaneous helpers; every helper is called on the class itself."""

    @staticmethod
    def displayPercent (counter, total, turn):
        """Validate the progress arguments and return an empty string.

        ``counter`` and ``total`` must be ints and ``turn`` a bool
        (True for a regular update, False for the final one). No output
        is produced, and ``total == 0`` no longer raises ZeroDivisionError.
        """
        assert (isinstance(counter, int) and isinstance(total, int) and isinstance(turn, bool))
        return ""

    @staticmethod
    def quitDuplicate (source):
        """Collapse every run of one repeated Chinese character into a single character.

        Characters outside the range U+4E00 to U+9FFF are kept as they are,
        so "aabb" is returned unchanged. Runs of any length are collapsed.
        """
        kept = []
        for char in source:
            # Skip a Chinese character identical to the one just kept
            if kept and kept[-1] == char and '\u4e00' <= char <= '\u9fff':
                continue
            kept.append(char)
        return "".join(kept)
# 主函数
if __name__ == "__main__":
    # Step 1: list the resume files of each type found in the folder
    PdfResumePath  = Reader(folder_Path = FolderPath).read(".pdf")
    DocxResumePath = Reader(folder_Path = FolderPath).read(".docx")
    # Step 2: extract every pdf resume, reporting progress after each one
    total = len(PdfResumePath)
    ResumeInfoList = []
    counter = 0
    for counter, file in enumerate(PdfResumePath, start = 1):
        # if (counter > 1): continue
        extracted = Extractor(file_dir = file).search()
        ResumeInfoList.append (extracted)
        function.displayPercent (counter, total, True)

    function.displayPercent (counter, total, False)
    # Show every record; counter and info stay module-level names while display() runs
    counter = 0
    for counter, info in enumerate(ResumeInfoList, start = 1):
        Generator(sourceInfo = info).display()

# coding:utf-8
# 版本04 试验对简历信息的分栏处理

import os
import re
from xml.dom.minidom import parse
import csv
import jieba
import pdfplumber as pb             
import sys        
import datetime
import pyDataverse as pd
import json
# import provinces

# PowerBi dataverse: base URL handed to the API client (left empty here)
BASE_URL  = ""

# Input: folder that contains the resume files; it is scanned for every resume
FolderPath = r"C:\Alan .AIA\Python\ResumeRespo"  

PdfResumePath = []              # list of matching pdf resume files
DocxResumePath = []             # list of matching docx resume files
filename = []                   # resume file names
ids = []                        # resume ids, assigned in order
data = []                       # final output data structure

# Reference sets, kept at module level because the extractor reads them as globals

# 200 hand-picked common surnames
Surname_List = ['赵','钱','孙','李','周','吴','郑','王','冯','陈','褚','卫','蒋','沈','韩','杨','朱','秦','尤','许','何','吕','施','张','孔','曹','严','金','魏','陶','姜','戚','谢','邹','苏','潘','葛','奚','范','彭','郎','鲁','韦','昌','马','苗','方','俞','任','袁','柳','酆','鲍','史','唐','费','廉','岑','薛','雷','贺','倪','汤','滕','殷','罗','毕','郝','邬','安','常','乐','于','时','傅','皮','齐','康','余','卜','顾','孟','平','黄','穆','萧','尹','姚','邵','汪','祁','毛','狄','米','贝','明','臧','成','戴','宋','茅','庞','熊','纪','舒','屈','项','祝','董','梁','杜','阮','蓝','闵','席','季','麻','贾','路','娄','危','江','童','颜','郭','梅','盛','林','徐','邱','骆','高','夏','蔡','田','樊','胡','凌','霍','虞','万','柯','管','卢','莫','房','丁','宣','邓','郁','单','杭','洪','包','诸','石','崔','吉','钮','龚','程','嵇','邢','裴','陆','翁','芮','靳','松','井','段','富','焦','巴','谷','车','全','郗','班','秋','仲','伊','宁','仇','栾','甘','祖','武','符','刘','景','詹','龙','叶','幸','韶','黎','溥','庄','白']

# Surname -> position in Surname_List: {'赵':0,'钱':1,'孙':2,'李':3, ...}
Surname_Dict = dict(zip(Surname_List, range(len(Surname_List))))

# Majors
Major_List = ['软件工程','计算机软件','计算机硬件','互联网','通信','电信','网络资源','计算机科学与技术']

# Skills
Skillset_List = ['Java', 'C', 'WEB', 'SQL', 'EJB', 'Cpp', 'C#', 'dotnet', 'RPA', 'Python', 'HTML', 'Html', 'CSS', 'JavaScript', 'R', '外语', 'Office', '项目']

# Locations
Location_List = ['成都', '广州']

# Resume sources
Vendor_List = ['猎聘', '智联', '前程', '领英', '51']

# Helpers
# Extractor: pulls the candidate fields out of a single resume file
class Extractor (object):
    """Extract candidate fields (name, phone, email, ...) from one resume file.

    Only ``.pdf`` files are read (through pdfplumber, imported as ``pb``); for
    any other extension ``fullText`` and ``fullWord`` stay empty and every
    search returns its "not found" value.

    Attributes:
        file_dir: path of the resume file.
        fullText: concatenated text of all pages.
        fullWord: word entries returned by pdfplumber's ``extract_words``.
    """

    def __init__ (self, file_dir):
        """Read the file at *file_dir* and cache its text and word list."""
        self.fullWord = []
        self.fullText = ""
        self.file_dir = file_dir
        # The page loop lives inside the branch: previously it ran for every
        # file type and hit an undefined ``pdf`` for non-PDF input.
        if os.path.splitext(self.file_dir)[1] == ".pdf":
            with pb.open(self.file_dir) as pdf:
                for page in pdf.pages:
                    self.fullWord += page.extract_words()
                    # extract_text() may return None for an empty page
                    self.fullText += page.extract_text() or ""

    # Required fields: name, applied position, major, phone, source, gender
    # 01 Name
    def __search_Name (self):
        """Return the candidate name, or "" when none is found.

        Order of precedence: a 2-3 character Chinese word in the file name that
        starts with a known surname; then, line by line, either the value of a
        "姓 名" field or the first surname-led word on the line.
        """
        # Split on both separators so directory names are never scanned.
        file_name = re.split(r"/+|\\+", self.file_dir)[-1]
        for candidate in re.findall(r"[\u4e00-\u9fa5]{2,3}", file_name):
            if candidate[0] in Surname_List:
                return candidate

        # "".join, not "|".join: inside a character class "|" is a literal.
        loose_name = re.compile("[" + "".join(Surname_List) + "]" + r"[\u4e00-\u9fa5]{1,3}")
        # Capture only the value so characters of the label are never stripped
        # out of the name itself.
        labelled_name = re.compile(r"姓[ ]+名[ ::]+([\u4e00-\u9fa5]{2,4})")
        candidates = []
        for line in self.fullText.split("\n"):
            if re.search(r"姓[ ]+名", line):
                found = labelled_name.search(line)
                if found:
                    candidates.append(found.group(1))
            else:
                # No name field on this line: take the first surname-led word.
                loose = loose_name.search(line)
                if loose:
                    return loose.group(0)

        # Among the labelled candidates, keep the first with a known surname.
        for candidate in candidates:
            if candidate[0] in Surname_List:
                return candidate
        return ""

    # 02 Applied position
    def __search_Jobs (self):
        """Return the value following a job-title label, or ""."""
        job_titles = ("期望职位", "应聘职位", "期望从事职位")
        for line in self.fullText.split("\n"):
            for title in job_titles:
                if title not in line:
                    continue
                # The label and any run of separators are left outside the
                # group, so the label can no longer leak into the result.
                found = re.search(title + r"[::\s]*([a-zA-Z0-9|\u4e00-\u9fa5]{2,14})", line)
                if found:
                    return found.group(1)
        return ""

    # 03 Major
    def __search_Major (self):
        """Return the candidate's major, or "".

        A known major from ``Major_List`` anywhere in the text wins; otherwise
        the first labelled "专 业" / "行 业" value; otherwise the first word in
        the body that ends in 专业/系/技术 (or a Bsc/Msc/Major token).
        """
        field_patterns = (
            r"专[ ]+业[::\s]*([\u4e00-\u9fa5]{2,10})",   # 51job layout
            r"行[ ]+业[::\s]*([\u4e00-\u9fa5]{2,10})",   # Liepin layout
        )
        labelled = []
        for line in self.fullText.split("\n"):
            for pattern in field_patterns:
                found = re.search(pattern, line)
                # Append only real matches; a stale value from an earlier line
                # used to be appended when the field had no value.
                if found:
                    labelled.append(found.group(1))
            for known in Major_List:
                if known in line:
                    return known
        if labelled:
            return labelled[0]

        # Fall back to the body words. The old guards (`"专业" or "系" ...`)
        # were always true and the match was never returned.
        for word in self.fullWord:
            text = word["text"] if isinstance(word, dict) else word
            found = re.search(r"[\u4e00-\u9fa5]{2,10}?(?:专业|系|技术)", text)
            # "...联系" (contact) is a known false positive of the "系" suffix.
            if found and not found.group(0).endswith("联系"):
                return found.group(0)
            # English degrees; `[a-Z]` was an invalid range.
            found = re.search(r"[a-zA-Z]{2,5}?(?:Bsc|Msc|Major)", text)
            if found:
                return found.group(0)
        return ""

    # 04 Phone
    # Area codes and extensions are not handled yet.
    def __search_Phone (self):
        """Return a phone number: 11-15 digits once brackets/dashes are removed.

        A number on a line labelled 电话/手机 is preferred. Otherwise all
        distinct numbers in the text are returned comma-separated, in order of
        first appearance.
        """
        full_text = self.fullText
        for line in full_text.split("\n"):
            if re.search(r"电\s*话", line) or re.search(r"手\s*机", line):
                digits = re.findall(r"\d{11,15}", re.sub(r"[()()::+\-]", "", line))
                if digits:
                    return digits[0]
        # Fallback runs once, after the loop. dict.fromkeys de-duplicates
        # while keeping a deterministic order (a set does not).
        cleaned = re.sub(r"[()()+\-]", "", full_text)
        return ",".join(dict.fromkeys(re.findall(r"\d{11,15}", cleaned)))

    # 06 Source (job board)
    def __search_Vendor (self):
        """Return the vendor named in the path or in the first 21 lines, or ""."""
        for vendor in Vendor_List:
            if vendor in self.file_dir:
                return vendor
        for line in self.fullText.split("\n")[:21]:
            for vendor in Vendor_List:
                if vendor in line:
                    return vendor
        return ""

    # 07 Gender (only detectable when the resume states it)
    def __search_Gender (self):
        """Return "男"/"女" if seen in the first 16 lines, else the raw field value."""
        gender = ""
        for line in self.fullText.split("\n")[:16]:
            # {1,10}: the usual value is a single character, which the old
            # {2,10} pattern missed and then crashed on `[0]`.
            labelled = re.search(r"性[ ]+别[::\s]*([\u4e00-\u9fa5]{1,10})", line)
            if labelled:
                gender = labelled.group(1)
            if "男" in line or "Male" in line:
                return "男"
            if "女" in line or "Female" in line:
                return "女"
        return gender

    # Optional fields
    # 08 Age
    def __search_Age (self):
        """Return the age as a string, from a birth year or an "N岁" statement."""
        current_year = datetime.datetime.now().year
        for line in self.fullText.split("\n"):
            if "出生年月" in line:
                years = re.findall(r"\d{4}", line)
                # Guarded: a birth field without a year used to hit int("").
                if years:
                    return str(current_year - int(years[0]))
            # Take the digits attached to 岁, not the first digits on the line.
            stated = re.search(r"(?<!\d)(\d{1,3})\s*岁", line)
            if stated:
                return stated.group(1)
        return ""

    # 09 Employment status
    def __search_Cond (self):
        """Return the first employment status found in the first 21 lines, or ""."""
        for line in self.fullText.split("\n")[:21]:
            for status in ("离职", "正在找工作", "在职"):
                if status in line:
                    return status
        return ""

    # 10 City
    def __search_City (self):
        """Return the first non-empty value of a location field, or ""."""
        label_patterns = (
            r"地点[::\s]*([\u4e00-\u9fa5]{2,8})",
            r"(?:所在地|现居地)[::\s]*([\u4e00-\u9fa5\s]{2,8})",
            r"住\s*址[::\s]*([\u4e00-\u9fa5\s]{2,8})",
            r"现居住[::\s]*([\u4e00-\u9fa5\s]{2,8})",
            r"Location[::\s]*([\u4e00-\u9fa5\s]{2,8})",
        )
        for line in self.fullText.split("\n"):
            for pattern in label_patterns:
                found = re.search(pattern, line)
                if not found:
                    continue
                # The value may contain inner whitespace; drop it. Label
                # characters are outside the group, so they are never removed
                # from the city name itself.
                city = re.sub(r"\s", "", found.group(1))
                if city:
                    return city
        return ""

    # 11 Education level
    def __search_Stage (self):
        """Return the education level from the first line that mentions one.

        Within a line the highest level wins (博士 > 硕士 > 本科); previously a
        line such as "某某大学 硕士" was reported as 本科. Defaults to "专科".
        """
        levels = (
            ("博士", ("博士",)),
            ("硕士", ("硕士", "研究生")),
            ("本科", ("大学", "本科")),
        )
        for line in self.fullText.split("\n"):
            for stage, keywords in levels:
                if any(keyword in line for keyword in keywords):
                    return stage
        return "专科"

    # 12 Hometown (not implemented)
    def __search_Hometown (self):
        """Placeholder: always returns ""."""
        return ""

    # 13-21 reserved for further searches

    # 22 Email
    def __search_Email (self):
        """Return the first token containing "@", preferring "邮 箱" lines, or ""."""
        token = re.compile(r"[a-zA-Z0-9_\-.@]+")
        # First look at lines carrying an email label.
        for line in self.fullText.split("\n"):
            if re.search(r"邮[ ]+箱", line):
                for candidate in token.findall(line):
                    if "@" in candidate:
                        return candidate
        # Then scan every extracted word.
        for word in self.fullWord:
            text = word["text"] if isinstance(word, dict) else word
            if "@" in text and "." in text:
                for candidate in token.findall(text):
                    if "@" in candidate:
                        return candidate
        return ""

    # Skills
    def __search_Skill (self):
        """Return every text line that mentions a keyword from ``Skillset_List``.

        Each line is listed once even if it mentions several keywords.
        """
        return [
            line
            for line in self.fullText.split("\n")
            if any(keyword in line for keyword in Skillset_List)
        ]

    # Entry point: run every search and return the collected fields
    def search (self):
        """Run all field searches and return the results as a dict.

        Each search is best-effort: a failure is printed and the field keeps
        its default "" so one bad field never aborts the whole resume.
        """
        # Split on "/" or "\": the file name is the last part, the directory
        # the one before it (empty when the path has no separator).
        sep_dir = re.split(r"/+|\\+", self.file_dir)
        file_name = sep_dir[-1]
        directory = sep_dir[-2] if len(sep_dir) > 1 else ""
        info = {"directory": directory, "file_name": file_name, "user_name": "", "email": "", "phone": "", "gender": "", "stage": "", "major": "", "age": "", "city": "", "skill": "", "jobs": "", "vendor": "", "condition": ""}

        steps = (
            ("user_name", "User_Name", self.__search_Name),
            ("email",     "Email",     self.__search_Email),
            ("phone",     "Phone",     self.__search_Phone),
            ("major",     "Major",     self.__search_Major),
            ("gender",    "Gender",    self.__search_Gender),
            ("stage",     "Stage",     self.__search_Stage),
            ("city",      "City",      self.__search_City),
            ("age",       "Age",       self.__search_Age),
            ("skill",     "Skill",     self.__search_Skill),
            ("jobs",      "Jobs",      self.__search_Jobs),
            ("vendor",    "Vendor",    self.__search_Vendor),
            # "condition" was declared in info but never filled in.
            ("condition", "Condition", self.__search_Cond),
        )
        for key, label, finder in steps:
            try:
                info[key] = finder()
            except Exception as e:
                # str(e): concatenating the exception object itself raised a
                # TypeError from inside the handler.
                print(label + ": " + str(e))
        return info

# 猎聘
# class Liepin (object):

# 智联
# class Zhilian (object):
# 前程无忧
# class Qiancheng (object):

# 51jobs
# class Jobs (object):

# Walks a folder and collects the resume files of one type
class Reader (object):
    """List the resume files of a given type inside one folder."""

    def __init__ (self, folder_Path):
        """Remember the folder to scan."""
        self.path = folder_Path

    def read (self, type):
        """Return the paths of the regular files in the folder ending in *type*.

        *type* is the file suffix, e.g. ".pdf" (matched case-insensitively).
        Names containing "$" are skipped. The result is sorted by file name.
        """
        suffix = type.lower()
        ResumePath = []
        for file in sorted(os.listdir(self.path)):
            # Join with the folder this reader was built for, not the
            # module-level FolderPath.
            filepath = os.path.join(self.path, file)
            if not os.path.isfile(filepath):
                continue
            # Test the file name only, so a folder name containing the
            # suffix or "$" does not affect the result.
            if file.lower().endswith(suffix) and "$" not in file:
                ResumePath.append(filepath)
        return ResumePath

# Output generation
class Generator (object):
    """Present one extracted resume record on screen or as JSON text."""

    def __init__ (self, sourceInfo):
        """Keep the record (a dict as produced by Extractor.search)."""
        self.info = sourceInfo

    def display (self, index = None):
        """Print the record.

        *index* is the candidate number shown in the banner. When omitted, the
        module-level ``counter`` is used if it exists, so existing callers
        keep their output; without one the number is left blank instead of
        raising NameError.
        """
        result = self.info
        number = index if index is not None else globals().get("counter", "")
        print("################### Candidate ", number, " ###################")
        # Necessary info
        print("Name     : ", result["user_name"])
        print("Position :", result["jobs"])
        print("Major    : ", result["major"])
        print("Phone    : ", result["phone"])
        print("Gender   : ", result["gender"])
        print("Source   : ", result["file_name"])
        print("Vendor   : ", result["vendor"])
        print("Condition: ", result["condition"])
        # Optional info
        print("Email    : ", result["email"])
        print("City     : ", result["city"])
        print("Age      : ", result["age"])
        print("Stage    : ", result["stage"])
        # print("SkillSet : ", "\n".join(result["skill"]))

    def generate_Json (self):
        """Return the record as pretty-printed JSON text.

        If the record cannot be serialised the error is printed and "" is
        returned (previously the return statement hit an unbound variable).
        """
        try:
            return json.dumps(self.info, sort_keys = True, indent = 4, separators = (',', ':'), ensure_ascii = False)
        except (TypeError, ValueError) as e:
            print(e)
            return ""

# Publishes the JSON resume data to Dataverse (Power BI)
class dataverse_Publish (object):
    """Create and publish the "Dataverse_Resumes" collection through pyDataverse."""

    def __init__ (self, sourceJson):
        """Keep the name of the JSON file that holds the resume data."""
        self.source = sourceJson

    def process (self):
        """Create, publish and fetch the collection; return the last response.

        Reads the module-level BASE_URL and API_TOKEN for the connection.
        """
        from pyDataverse.api import NativeApi
        from pyDataverse.models import Dataverse
        from pyDataverse.utils import read_file

        sourceFile = "TestJson.json"
        # Connect to the API
        api = NativeApi(BASE_URL, API_TOKEN)
        # Build the collection metadata. The metadata file was named and
        # read_file imported, but nothing was ever loaded into `dv`.
        # NOTE(review): assumes TestJson.json holds the Dataverse collection
        # metadata, as in the pyDataverse user guide - confirm.
        dv = Dataverse()
        dv.from_json(read_file(sourceFile))
        resp = api.create_dataverse (":root", dv.json())
        resp = api.publish_dataverse ("Dataverse_Resumes")
        resp = api.get_dataverse ("Dataverse_Resumes")
        # Hand the final response back so callers can inspect the outcome.
        return resp

# Entry point
if __name__ == "__main__":
    # Step 1: collect every resume file in the folder
    PdfResumePath  = Reader(folder_Path = FolderPath).read(".pdf")
    DocxResumePath = Reader(folder_Path = FolderPath).read(".docx")
    # Step 2: extract each resume and write the records to a JSON file
    counter = 0
    Json_filename = "resume_Result.json"
    # Mode 'w' already empties the file; the with-block guarantees it is
    # closed (and flushed) even if one resume raises.
    with open(Json_filename, 'w', encoding = 'utf-8') as Json_file:
        for file in PdfResumePath:
            # counter stays a module-level name: Generator.display reads it
            counter = counter + 1
            # if (counter < 6): continue
            info = Extractor(file_dir = file).search()
            # Show the record and append it to the JSON file
            generator = Generator(sourceInfo = info)
            generator.display()
            Json_file.write(generator.generate_Json() + "\n")
    # Copy into the repository
    # Step 3: export to dataverse
    # dataverse_Publish(sourceJson = Json_filename).process()


# 函数 读取信息             
# print (ResumePath[0])
# xingming_node = document_tree.getElementsByTagName("XingMing")[0]
# xingming = xingming_node.childNodes[0].data
# 函数 将一份简历信息写入 Excel 文件
# print (ResumePath)
# print (filename)

# Test for pyDataverse

from pyDataverse.api    import NativeApi, DataAccessApi
from pyDataverse.models import Dataverse
from pyDataverse.models import Datafile
from pyDataverse.models import Dataset
from pyDataverse.utils  import read_file
from pyDataverse.api    import NativeApi

# Input: the website of Dataverse and Token
BASE_URL  = ""
# The token was announced above and read by pyDataFunc.__init__ but never
# defined, which raised a NameError. Fill it in locally; do not commit a
# real token.
API_TOKEN = ""

class pyDataFunc (object):
    """Thin wrapper around the pyDataverse calls used to publish resume data."""

    # Set up
    def __init__ (self):
        """Read the connection settings from the module-level constants."""
        self.baseURL  = BASE_URL
        self.apiTOKEN = API_TOKEN
        self.api = None

    # Shared API client, created on first use
    def _client (self):
        """Return the NativeApi client, creating it once.

        The create/upload methods used a name `api` that only existed as a
        local inside getAPI, so every one of them raised NameError.
        """
        if getattr(self, "api", None) is None:
            # both settings must be strings
            assert (isinstance(self.baseURL, str) and isinstance(self.apiTOKEN, str))
            self.api = NativeApi(self.baseURL, self.apiTOKEN)
        return self.api

    # Connect to the API and probe it; returns the version response
    def getAPI (self):
        """Query the server version, print the response and return it."""
        response = self._client().get_info_version()
        print (response)
        return response

    # Build a Dataverse collection model from a JSON metadata file
    def setDataverse (self, filename):
        """Return a Dataverse model loaded from the JSON file *filename*."""
        # the input must be a json file
        assert(isinstance(filename, str) and (".json" in filename))
        dv = Dataverse()
        # Load the metadata; the file name was validated but never read.
        dv.from_json(read_file(filename))
        return dv

    # Build a Dataset model from a JSON metadata file
    def setDataset (self, filename):
        """Return a Dataset model loaded from the JSON file *filename*."""
        # the input must be a json file
        assert(isinstance(filename, str) and (".json" in filename))
        ds = Dataset()
        # Without this, createDataset's validate_json() check is run
        # against an empty model.
        ds.from_json(read_file(filename))
        return ds

    # Create the Dataverse collection; returns the API response
    def createDataverse (self, dv, verseLoc, verseName):
        """Create collection *dv* under the parent *verseLoc*."""
        # NOTE(review): verseName is only type-checked, never used - confirm
        # whether it was meant to set the collection alias.
        assert(isinstance(verseName, str))

        response = self._client().create_dataverse (verseLoc, dv.json())
        return response

    # Create the dataset; returns the API response
    def createDataset (self, ds, setName):
        """Create dataset *ds* inside the collection named *setName*."""
        assert(ds.validate_json() and isinstance(setName, str))

        response = self._client().create_dataset(setName, ds.json())
        return response

    # Read the persistent id out of a create-dataset response
    def getPID (self, resp):
        """Return ``resp.json()["data"]["persistentId"]``."""
        ds_pid = resp.json()["data"]["persistentId"]
        return ds_pid

    # Upload a data file to the dataset identified by pid
    def uploadDatafile (self, filename, pid):
        """Upload *filename* to dataset *pid* and return the API response."""
        # the file name must be a string
        assert(isinstance(filename, str))

        df = Datafile()
        df.set({'pid': pid, 'filename': filename})
        response = self._client().upload_datafile(pid, filename, df.json())
        return response

if __name__ == "__main__":
    filename = ""
    # The methods are instance methods: calling them on the class itself
    # bound `filename` to `self` and failed with a TypeError.
    client = pyDataFunc()
    ds  = client.setDataset (filename)
    rs1 = client.createDataset (ds, "A set name")
    # was `rs`, an undefined name
    pid = client.getPID (rs1)
    rs2 = client.uploadDatafile(filename, pid)
    # Inspect a result with .get()[""]
    # Check that the file is valid json with validate_json()
from selenium import webdriver

# Deploy Driver
class function (object):
    """Open a URL in a Chrome browser driven by selenium."""

    def __init__ (self, inputurl):
        """Store the target URL and start the Chrome driver."""
        # Browser setup
        self.url = inputurl
        self.driver = webdriver.Chrome(r"C:/Alan .AIA/Python/Driver/chromedriver.exe")

    def getBing (self):
        """Load ``self.url`` in the browser."""
        # NOTE(review): the method had no body (a SyntaxError); opening the
        # configured URL is the minimal reading of its name - confirm.
        self.driver.get(self.url)

# Main Function
if __name__ == "__main__":
    function(inputurl = "").getBing()

# -*- coding:utf-8 -*-
# Author: juzstu
# Time: 2019/8/22 0:31

import pandas as pd
import numpy as np
import jieba as jb
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import re
import warnings
from tqdm import tqdm
from joblib import Parallel, delayed


def modified_jd_df(jd_path):
    """Load the tab-separated job-description file at *jd_path* as a DataFrame.

    The first line supplies the column names. Fields are kept exactly as
    split, so the last field of every row (and the last column name) still
    ends with its newline. Line 175425 contains stray tabs inside one field
    and is patched before splitting. An empty file yields an empty DataFrame.
    """
    rows = []
    # The with-block closes the file; iterating it avoids readlines().
    with open(jd_path, encoding='utf8') as jd_file:
        for i, line in enumerate(jd_file):
            if i == 175425:
                line = line.replace('销售\t|置业顾问\t|营销', '销售|置业顾问|营销')
            # Rows were split but never collected, so the result was built
            # from an empty list.
            rows.append(line.split('\t'))
    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(rows[1:], columns=rows[0])

def get_min_salary(x):
    """Return the lower bound of a packed "minmax" salary string, or -1.

    The string is the two bounds concatenated; its length (9, 10, 11 or 12)
    decides where the lower bound ends. Any other length gives -1 (the
    fallback was unreachable before, so None was returned instead).
    """
    # length of the string -> number of leading characters for the minimum
    split_at = {12: 6, 11: 5, 10: 5, 9: 4}.get(len(x))
    if split_at is None:
        return -1
    return int(x[:split_at])

def get_max_salary(x):
    """Return the upper bound of a packed "minmax" salary string, or -1.

    The string is the two bounds concatenated; its length (9, 10, 11 or 12)
    decides where the upper bound starts. Any other length gives -1 (the
    fallback was unreachable before, so None was returned instead).
    """
    # length of the string -> index at which the maximum starts
    split_at = {12: 6, 11: 5, 10: 5, 9: 4}.get(len(x))
    if split_at is None:
        return -1
    return int(x[split_at:])

def is_same_user_city(df):
    """Tell whether the row's live city id occurs in its desired-city field.

    *df* is a row-like mapping; the live city id is converted to text and
    looked up inside the 'desire_jd_city_id' value.
    """
    current_city = str(df['live_city_id'])
    wanted_cities = df['desire_jd_city_id']
    if current_city in wanted_cities:
        return True
    return False

def jieba_cnt(df):
    """Count the job-description tokens that also occur in the experience text.

    The tokens of 'jd_title' and 'jd_sub_type' (jieba search-mode cut) are
    pooled, and the number of distinct ones found among the tokens of
    'experience' is returned. Returns 0 when any of the three fields is not
    a string (e.g. NaN); that fallback was unreachable before and
    'jd_title' was not checked at all.
    """
    experience = df['experience']
    jd_title = df['jd_title']
    jd_sub_type = df['jd_sub_type']
    if not all(isinstance(value, str) for value in (experience, jd_title, jd_sub_type)):
        return 0
    jd_tokens = set(jb.cut_for_search(jd_title)) | set(jb.cut_for_search(jd_sub_type))
    experience_tokens = set(jb.cut_for_search(experience))
    return len(jd_tokens & experience_tokens)

def cur_industry_in_desire(df):
    """Tell whether the current industry id occurs in the desired-industry field.

    Returns True/False when both fields are strings, and -1 otherwise (that
    fallback was unreachable before, so None was returned instead).
    """
    cur_industry_id = df['cur_industry_id']
    desire_jd_industry_id = df['desire_jd_industry_id']
    if isinstance(cur_industry_id, str) and isinstance(desire_jd_industry_id, str):
        return cur_industry_id in desire_jd_industry_id
    return -1

def desire_in_jd(df):
    """Tell whether the job's sub type occurs in the desired job-type field.

    Returns True/False when both fields are strings, and -1 otherwise (that
    fallback was unreachable before, so None was returned instead).
    """
    desire_jd_type_id = df['desire_jd_type_id']
    jd_sub_type = df['jd_sub_type']
    if isinstance(jd_sub_type, str) and isinstance(desire_jd_type_id, str):
        return jd_sub_type in desire_jd_type_id
    return -1

def get_tfidf(df, names, merge_id):
    """Build 10 SVD components of the TF-IDF matrix of text column *names*.

    The column is vectorised with uni- and bi-grams, reduced with
    TruncatedSVD, and the components (named '<names>_svd_<i>') are returned
    side by side with the *merge_id* column of *df*.
    """
    n_dims = 10
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    text_matrix = vectorizer.fit_transform(df[names])
    reducer = TruncatedSVD(n_components=n_dims, n_iter=20, random_state=2019)
    components = pd.DataFrame(reducer.fit_transform(text_matrix))
    components.columns = [f'{names}_svd_{i}' for i in range(n_dims)]
    id_column = df[[merge_id]]
    return pd.concat([id_column, components], axis=1)

def get_str(x):
    """Cut *x* with jieba and join the tokens not in ``stop_words`` with spaces."""
    kept = []
    for token in jb.cut(x):
        if token not in stop_words:
            kept.append(token)
    return ' '.join(kept)

def offline_eval_map(train_df, label, pred_col):
    """Return the mean average precision of *pred_col* against *label*.

    Rows are ranked per 'user_id' by descending prediction. For every
    positive row (label == 1) the score is its position among the user's
    positives divided by its overall rank; scores are averaged per user and
    then across users. *train_df* itself is not modified.
    """
    ranked = train_df.copy()
    ranked['rank'] = ranked.groupby('user_id')[pred_col].rank(ascending=False, method='first')
    # Explicit copy: the new columns were being assigned on a filtered slice,
    # which triggers pandas' SettingWithCopyWarning.
    positives = ranked[ranked[label] == 1].copy()
    positives[f'{label}_index'] = positives.groupby('user_id')['rank'].rank(ascending=True, method='first')
    # Same rows, so the positives' own 'rank' equals the index-aligned
    # lookup in the full frame.
    positives['score'] = positives[f'{label}_index'] / positives['rank']
    return positives.groupby('user_id')['score'].mean().mean()

def sub_on_line(train_, test_, pred, label, cate_cols, is_shuffle=True, use_cate=True):
    # Train one LightGBM model per fold and produce averaged test predictions
    # plus out-of-fold predictions for the training rows.
    #
    # train_    : labelled frame; gains a f'{label}_pred' column (modified in place).
    # test_     : frame to score; its `label` column is overwritten (modified in place).
    # pred      : list of feature column names fed to the model.
    # label     : name of the target column.
    # cate_cols : columns passed as categorical_feature when use_cate is true.
    # is_shuffle: forwarded to KFold(shuffle=...).
    # Returns (test_[user_id, jd_no, label], train_[user_id, jd_no, f'{label}_pred', label]).
    # Side effect: writes the averaged feature importances to 'feat_imp_base.csv'.
    print(f'data shape:\ntrain--{train_.shape}\ntest--{test_.shape}')
    n_splits = 5
    # NOTE(review): recent scikit-learn versions reject a random_state when
    # shuffle is False, so is_shuffle=False may raise here -- confirm.
    folds = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=1024)
    # One column of test predictions per fold; averaged after the loop.
    sub_preds = np.zeros((test_.shape[0], folds.n_splits))
    train_[f'{label}_pred'] = 0
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = pred
    print(f'Use {len(pred)} features ...')
    # NOTE(review): nothing visible in this function appends to auc_scores, so
    # the mean printed at the end would be NaN -- confirm against the full source.
    auc_scores = []
    # NOTE(review): the closing brace of this dict is missing in this copy of the file.
    params = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 63,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 1,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': -1,
        'verbose': -1
    # Folds are drawn over unique user ids, so all rows of one user land on
    # the same side of each train/validation split.
    train_user_id = train_['user_id'].unique()
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_user_id), start=1):
        print(f'the {n_fold} training start ...')
        train_x, train_y = train_.loc[train_['user_id'].isin(train_user_id[train_idx]), pred], train_.loc[
            train_['user_id'].isin(train_user_id[train_idx]), label]
        valid_x, valid_y = train_.loc[train_['user_id'].isin(train_user_id[valid_idx]), pred], train_.loc[
            train_['user_id'].isin(train_user_id[valid_idx]), label]
        print(f'for train user:{len(train_idx)}\nfor valid user:{len(valid_idx)}')
        # NOTE(review): as written, the two categorical datasets are immediately
        # overwritten by the plain ones below; an `else:` appears to be missing
        # between the two pairs of assignments.
        if use_cate:
            dtrain = lgb.Dataset(train_x, label=train_y, categorical_feature=cate_cols)
            dvalid = lgb.Dataset(valid_x, label=valid_y, categorical_feature=cate_cols)
            dtrain = lgb.Dataset(train_x, label=train_y)
            dvalid = lgb.Dataset(valid_x, label=valid_y)

        # NOTE(review): the arguments of this lgb.train call are missing in this copy of the file.
        clf = lgb.train(
        # Fold n_fold fills column n_fold - 1 of the test prediction matrix.
        sub_preds[:, n_fold - 1] = clf.predict(test_[pred], num_iteration=clf.best_iteration)
        fold_importance_df[f'fold_{n_fold}_imp'] = clf.feature_importance()
        # Out-of-fold predictions for the users held out in this fold.
        train_.loc[train_['user_id'].isin(train_user_id[valid_idx]), f'{label}_pred'] = \
            clf.predict(valid_x, num_iteration=clf.best_iteration)

    # Average the per-fold importances and persist them, most important first.
    five_folds = [f'fold_{f}_imp' for f in range(1, n_splits + 1)]
    fold_importance_df['avg_imp'] = fold_importance_df[five_folds].mean(axis=1)
    fold_importance_df.sort_values(by='avg_imp', ascending=False, inplace=True)
    fold_importance_df[['Feature', 'avg_imp']].to_csv('feat_imp_base.csv', index=False, encoding='utf8')
    # The final test prediction is the mean over the fold models.
    test_[label] = np.mean(sub_preds, axis=1)
    print('auc score', np.mean(auc_scores))
    return test_[['user_id', 'jd_no', label]], train_[['user_id', 'jd_no', f'{label}_pred', label]]


# Script entry point: builds user / job / action features for the train and
# test sets, trains the 'satisfied' and 'delivered' models through sub_on_line,
# prints the offline score and writes the ranked submission to 'sub_base.csv'.
if __name__ == "__main__":
    # Lookup tables keyed by the 'min_years' code of a job posting.
    # NOTE(review): max_work_year has no entry for 1099, so .map() yields NaN for that code.
    min_work_year = {103: 1, 305: 3, 510: 5, 1099: 10}
    max_work_year = {103: 3, 305: 5, 510: 10}
    # Ordinal encoding of education levels (keys are the raw labels found in the data).
    degree_map = {'其他': 0, '初中': 1, '中技': 2, '中专': 2, '高中': 2, '大专': 3, '本科': 4,
                  '硕士': 5, 'MBA': 5, 'EMBA': 5, '博士': 6}

    sub_path = './submit/'
    train_data_path = './zhaopin_round1_train_20190716/'
    test_data_path = './zhaopin_round1_test_20190716/'
    # ---- Training users -------------------------------------------------
    train_user = pd.read_csv(train_data_path + 'table1_user', sep='\t')
    # NOTE(review): '\d+' is not a raw string; newer Python versions warn about the invalid escape.
    train_user['desire_jd_city_id'] = train_user['desire_jd_city_id'].apply(lambda x: re.findall('\d+', x))
    train_user['desire_jd_salary_id'] = train_user['desire_jd_salary_id'].astype(str)
    # get_min_salary / get_max_salary are defined outside this section of the file.
    train_user['min_desire_salary'] = train_user['desire_jd_salary_id'].apply(get_min_salary)
    train_user['max_desire_salary'] = train_user['desire_jd_salary_id'].apply(get_max_salary)
    train_user['min_cur_salary'] = train_user['cur_salary_id'].apply(get_min_salary)
    train_user['max_cur_salary'] = train_user['cur_salary_id'].apply(get_max_salary)
    train_user.drop(['desire_jd_salary_id', 'cur_salary_id'], axis=1, inplace=True)
    # ---- Job descriptions -----------------------------------------------
    train_jd = pd.read_csv(train_data_path + 'table2_jd.csv', sep='\t')
    train_jd.drop(['company_name', 'max_edu_level', 'is_mangerial', 'resume_language_required'], axis=1, inplace=True)

    train_jd['min_work_year'] = train_jd['min_years'].map(min_work_year)
    # Both work-year columns are derived from the same 'min_years' code.
    train_jd['max_work_year'] = train_jd['min_years'].map(max_work_year)
    # Replace the r'\N' placeholder with a sentinel date; rows carrying the
    # sentinel end date are blanked again a few lines below.
    train_jd['start_date'].replace(r'\N', '22000101', inplace=True)
    train_jd['end_date'].replace(r'\N', '22000101', inplace=True)
    # NOTE(review): the next statement is cut off after `lambda x:` in this copy of the file.
    train_jd['start_date'] = pd.to_datetime(train_jd['start_date'].astype(str).apply(lambda x:
    train_jd['end_date'] = pd.to_datetime(train_jd['end_date'].astype(str).apply(lambda x: f'{x[:4]}-{x[4:6]}-{x[6:]}'))
    train_jd.loc[train_jd['end_date'] == '2200-01-01', ['start_date', 'end_date']] = np.nan

    # Stop words for tokenizing job descriptions.
    # NOTE(review): the file handle opened here is never closed explicitly.
    stop_words = [i.strip() for i in open('中文停用词表.txt', 'r', encoding='utf8').readlines()]
    stop_words.extend(['\n', '\xa0', '\u3000', '\u2002'])
    # Tokenize every job description in parallel; the column name really ends with a newline character.
    tmp_cut = Parallel(n_jobs=-1)(delayed(get_str)(train_jd.loc[ind]['job_description\n'])
                                  for ind in tqdm(train_jd.index))

    # TF-IDF (uni- and bi-grams) of the descriptions, reduced to 10 SVD components.
    tfidf_enc = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vec = tfidf_enc.fit_transform(tmp_cut)
    svd_tag = TruncatedSVD(n_components=10, n_iter=20, random_state=2019)
    tag_svd = svd_tag.fit_transform(tfidf_vec)
    tag_svd = pd.DataFrame(tag_svd)
    tag_svd.columns = [f'desc_svd_{i}' for i in range(10)]
    train_jd = pd.concat([train_jd, tag_svd], axis=1)

    # ---- User actions (training labels) ---------------------------------
    train_action = pd.read_csv(train_data_path + 'table3_action', sep='\t')
    train_action['user_jd_cnt'] = train_action.groupby(['user_id', 'jd_no'])['jd_no'].transform('count').values
    train_action['jd_cnt'] = train_action.groupby(['user_id'])['jd_no'].transform('count').values
    train_action['jd_nunique'] = train_action.groupby(['user_id'])['jd_no'].transform('nunique').values
    train_action = train_action.drop_duplicates()
    # After the ascending sort, keep='last' retains, per (user_id, jd_no), the
    # row with the largest (delivered, satisfied) values.
    train_action.sort_values(['user_id', 'jd_no', 'delivered', 'satisfied'], inplace=True)
    train_action = train_action.drop_duplicates(subset=['user_id', 'jd_no'], keep='last')
    # Keep only actions on jobs that exist in the job table.
    train_action = train_action[train_action['jd_no'].isin(train_jd['jd_no'].unique())]

    train = train_action.merge(train_user, on='user_id', how='left')
    train = train.merge(train_jd, on='jd_no', how='left')
    del train['browsed']

    print('train data base feats already generated ...')

    # ---- Test users: same preprocessing as the training users -----------
    test_user = pd.read_csv(test_data_path + 'user_ToBePredicted', sep='\t')
    test_user['desire_jd_city_id'] = test_user['desire_jd_city_id'].apply(lambda x: re.findall('\d+', x))
    test_user['desire_jd_salary_id'] = test_user['desire_jd_salary_id'].astype(str)
    test_user['min_desire_salary'] = test_user['desire_jd_salary_id'].apply(get_min_salary)
    test_user['max_desire_salary'] = test_user['desire_jd_salary_id'].apply(get_max_salary)
    test_user['min_cur_salary'] = test_user['cur_salary_id'].apply(get_min_salary)
    test_user['max_cur_salary'] = test_user['cur_salary_id'].apply(get_max_salary)
    test_user.drop(['desire_jd_salary_id', 'cur_salary_id'], axis=1, inplace=True)

    test = pd.read_csv(test_data_path + 'zhaopin_round1_user_exposure_B_20190819', sep=' ')
    test['user_jd_cnt'] = test.groupby(['user_id', 'jd_no'])['jd_no'].transform('count').values
    test['jd_cnt'] = test.groupby(['user_id'])['jd_no'].transform('count').values
    test['jd_nunique'] = test.groupby(['user_id'])['jd_no'].transform('nunique').values
    test = test.drop_duplicates()

    # -1 marks unlabeled (test) rows; the same marker later splits all_data
    # back into train and test parts.
    test['delivered'] = -1
    test['satisfied'] = -1

    test = test.merge(test_user, on='user_id', how='left')
    test = test.merge(train_jd, on='jd_no', how='left')

    print('test data base feats already generated ...')

    # NOTE(review): DataFrame.append was removed in pandas 2.0; this line needs
    # pd.concat on current pandas.
    all_data = train.append(test, sort=False)

    # ---- Pairwise user / job match features ------------------------------
    all_data['jd_user_cnt'] = all_data.groupby(['jd_no'])['user_id'].transform('count').values
    # is_same_user_city is defined outside this section of the file.
    all_data['same_user_city'] = all_data.apply(is_same_user_city, axis=1).astype(int)
    all_data['city'].fillna(-1, inplace=True)
    all_data['city'] = all_data['city'].astype(int)
    all_data['same_com_live'] = (all_data['city'] == all_data['live_city_id']).astype(int)
    all_data['min_edu_level'] = all_data['min_edu_level'].apply(lambda x: x.strip() if isinstance(x, str) else x)
    all_data['cur_degree_id'] = all_data['cur_degree_id'].apply(lambda x: x.strip() if isinstance(x, str) else x)
    all_data['min_edu_level_num'] = all_data['min_edu_level'].map(degree_map)
    all_data['cur_degree_id_num'] = all_data['cur_degree_id'].map(degree_map)
    all_data['same_edu'] = (all_data['min_edu_level'] == all_data['cur_degree_id']).astype(int)
    all_data['gt_edu'] = (all_data['cur_degree_id_num'] >= all_data['min_edu_level_num']).astype(int)
    # Despite the `_num` suffix, the four salary columns below are 0/1 comparison flags.
    all_data['min_desire_salary_num'] = (all_data['min_desire_salary'] <= all_data['min_salary']).astype(int)
    all_data['min_cur_salary_num'] = (all_data['min_cur_salary'] <= all_data['min_salary']).astype(int)

    all_data['max_desire_salary_num'] = (all_data['max_desire_salary'] <= all_data['max_salary']).astype(int)
    all_data['max_cur_salary_num'] = (all_data['max_cur_salary'] <= all_data['max_salary']).astype(int)
    all_data['same_desire_industry'] = all_data.apply(cur_industry_in_desire, axis=1).astype(int)
    all_data['same_jd_sub'] = all_data.apply(desire_in_jd, axis=1).astype(int)

    # NOTE(review): 'start_day' and 'end_day' store the full start_date value,
    # and both 'end_*' columns are derived from start_date rather than
    # end_date -- presumably `.dt.day` / end_date were intended; confirm.
    all_data['start_month'] = all_data['start_date'].dt.month
    all_data['start_day'] = all_data['start_date']
    all_data['end_month'] = all_data['start_date'].dt.month
    all_data['end_day'] = all_data['start_date']
    all_data['jd_days'] = (all_data['end_date'] - all_data['start_date']).dt.days

    # Work experience measured against the hard-coded reference year 2019; '-' is treated as missing.
    all_data['user_work_year'] = 2019 - all_data['start_work_date'].replace('-', np.nan).astype(float)
    all_data['gt_min_year'] = (all_data['user_work_year'] > all_data['min_work_year']).astype(int)
    all_data['gt_max_year'] = (all_data['user_work_year'] > all_data['max_work_year']).astype(int)
    all_data['len_experience'] = all_data['experience'].apply(
        lambda x: len(x.split('|')) if isinstance(x, str) else np.nan)
    all_data['desire_jd_industry_id_len'] = all_data['desire_jd_industry_id'].apply(
        lambda x: len(x.split(',')) if isinstance(x, str) else np.nan)
    all_data['desire_jd_type_id_len'] = all_data['desire_jd_type_id'].apply(
        lambda x: len(x.split(',')) if isinstance(x, str) else np.nan)
    all_data['eff_exp_cnt'] = all_data.apply(jieba_cnt, axis=1)
    all_data['eff_exp_ratio'] = all_data['eff_exp_cnt'] / all_data['len_experience']
    # NOTE(review): the 'key' column dropped here is not created anywhere in
    # this section; presumably it comes from one of the input tables -- confirm.
    all_data.drop(['cur_degree_id_num', 'cur_degree_id', 'desire_jd_city_id', 'min_years',
                   'start_work_date', 'start_date', 'end_date', 'key', 'min_edu_level'], axis=1, inplace=True)

    # City statistics
    all_data['user_jd_city_nunique'] = all_data.groupby('user_id')['city'].transform('nunique').values
    all_data['jd_user_city_nunique'] = all_data.groupby('jd_no')['live_city_id'].transform('nunique').values

    all_data['jd_title_nunique'] = all_data.groupby('user_id')['jd_title'].transform('nunique').values
    all_data['jd_sub_type_nunique'] = all_data.groupby('user_id')['jd_sub_type'].transform('nunique').values

    # NOTE(review): the next two statements are cut off after `.transform(` in this copy of the file.
    all_data['user_desire_jd_industry_id_nunique'] = all_data.groupby('jd_no')['desire_jd_industry_id'].transform(
    all_data['user_desire_jd_type_id_nunique'] = all_data.groupby('jd_no')['desire_jd_type_id'].transform(

    # Salary statistics: per-user aggregates of the job salaries, then per-job
    # aggregates of the users' desired salaries.
    all_data['user_jd_min_salary_min'] = all_data.groupby('user_id')['min_salary'].transform('min').values
    all_data['user_jd_min_salary_max'] = all_data.groupby('user_id')['min_salary'].transform('max').values
    all_data['user_jd_min_salary_mean'] = all_data.groupby('user_id')['min_salary'].transform('mean').values
    all_data['user_jd_min_salary_std'] = all_data.groupby('user_id')['min_salary'].transform('std').values

    all_data['user_jd_max_salary_min'] = all_data.groupby('user_id')['max_salary'].transform('min').values
    all_data['user_jd_max_salary_max'] = all_data.groupby('user_id')['max_salary'].transform('max').values
    all_data['user_jd_max_salary_mean'] = all_data.groupby('user_id')['max_salary'].transform('mean').values
    all_data['user_jd_max_salary_std'] = all_data.groupby('user_id')['max_salary'].transform('std').values

    all_data['jd_user_desire_min_salary_min'] = all_data.groupby('jd_no')['min_desire_salary'].transform('min').values
    all_data['jd_user_desire_min_salary_max'] = all_data.groupby('jd_no')['min_desire_salary'].transform('max').values
    all_data['jd_user_desire_min_salary_mean'] = all_data.groupby('jd_no')['min_desire_salary'].transform('mean').values
    all_data['jd_user_desire_min_salary_std'] = all_data.groupby('jd_no')['min_desire_salary'].transform('std').values

    all_data['jd_user_desire_max_salary_min'] = all_data.groupby('jd_no')['max_desire_salary'].transform('min').values
    all_data['jd_user_desire_max_salary_max'] = all_data.groupby('jd_no')['max_desire_salary'].transform('max').values
    all_data['jd_user_desire_max_salary_mean'] = all_data.groupby('jd_no')['max_desire_salary'].transform('mean').values
    all_data['jd_user_desire_max_salary_std'] = all_data.groupby('jd_no')['max_desire_salary'].transform('std').values

    # Per-user statistics of the posting duration (jd_days).
    all_data['jd_days_min'] = all_data.groupby('user_id')['jd_days'].transform('min').values
    all_data['jd_days_max'] = all_data.groupby('user_id')['jd_days'].transform('max').values
    all_data['jd_days_mean'] = all_data.groupby('user_id')['jd_days'].transform('mean').values
    all_data['jd_days_std'] = all_data.groupby('user_id')['jd_days'].transform('std').values
    all_data['jd_days_skew'] = all_data.groupby('user_id')['jd_days'].transform('skew').values

    # Per-job statistics of the 'birthday' column, stored under age_* names.
    all_data['age_min'] = all_data.groupby('jd_no')['birthday'].transform('min').values
    all_data['age_max'] = all_data.groupby('jd_no')['birthday'].transform('max').values
    all_data['age_mean'] = all_data.groupby('jd_no')['birthday'].transform('mean').values
    all_data['age_std'] = all_data.groupby('jd_no')['birthday'].transform('std').values
    all_data['age_skew'] = all_data.groupby('jd_no')['birthday'].transform('skew').values

    # Integer-encode the two job text categories (missing values become the string 'nan').
    for j in ['jd_title', 'jd_sub_type']:
        le = LabelEncoder()
        all_data[j].fillna('nan', inplace=True)
        all_data[f'{j}_map_num'] = le.fit_transform(all_data[j])

    # Turn '|'-separated experience entries into space-separated text.
    # NOTE(review): for non-string values the join runs over the characters of
    # 'nan' and yields 'n a n' -- confirm whether that is intended.
    all_data['experience'] = all_data['experience'].apply(lambda x: ' '.join(x.split('|') if
                                                                             isinstance(x, str) else 'nan'))
    # Concatenate the experience text of all users per job and embed it with TF-IDF + SVD.
    exp_gp = all_data.groupby('jd_no')['experience'].agg(lambda x: ' '.join(x.to_list())).reset_index()
    exp_gp = get_tfidf(exp_gp, 'experience', 'jd_no')
    all_data = all_data.merge(exp_gp, on='jd_no', how='left')

    # Model features: every column except the ids, the targets and the raw text columns.
    use_feats = [c for c in all_data.columns if c not in ['user_id', 'jd_no', 'delivered', 'satisfied'] +
                 ['desire_jd_industry_id', 'desire_jd_type_id', 'cur_industry_id', 'cur_jd_type', 'experience',
                 'jd_title', 'jd_sub_type', 'job_description\n']]

    # One model per target; rows labelled -1 form the test set.
    sub_sat, train_pred_sat = sub_on_line(all_data[all_data['satisfied'] != -1], all_data[all_data['satisfied'] == -1],
                                          use_feats, 'satisfied', ['live_city_id', 'city'], use_cate=True)

    sub_dev, train_pred_dev = sub_on_line(all_data[all_data['delivered'] != -1], all_data[all_data['delivered'] == -1],
                                          use_feats, 'delivered', ['live_city_id', 'city'], use_cate=True)

    # Blend the two predictions with weights 0.8 (satisfied) and 0.2 (delivered).
    train_pred_sat['merge_pred'] = train_pred_sat['satisfied_pred'] * 0.8 + train_pred_dev['delivered_pred'] * 0.2
    sub_sat['merge_prob'] = sub_sat['satisfied'] * 0.8 + sub_dev['delivered'] * 0.2

    train_pred_sat = train_pred_sat.merge(all_data[all_data['delivered'] != -1][['user_id', 'jd_no', 'delivered']],
                                          on=['user_id', 'jd_no'], how='left')

    # Offline score: 0.7 * MAP(satisfied) + 0.3 * MAP(delivered) on the out-of-fold predictions.
    dev_map = offline_eval_map(train_pred_sat, 'delivered', 'merge_pred')
    sat_map = offline_eval_map(train_pred_sat, 'satisfied', 'merge_pred')
    print('dev map:', round(dev_map, 4), 'sat map:', round(sat_map, 4), 'final score:',
          round(0.7 * sat_map + 0.3 * dev_map, 4))

    # Build the submission: per user, jobs known to the job table sorted by
    # blended probability first, then the jobs missing from the job table.
    sub_df = pd.DataFrame(columns=['user_id', 'jd_no', 'merge_prob'])
    for i in sub_sat['user_id'].unique():
        tmp_sub = sub_sat[(sub_sat['user_id'] == i) &
                            (sub_sat['jd_no'].isin(train_jd['jd_no']))].sort_values('merge_prob', ascending=False)[
                            ['user_id', 'jd_no', 'merge_prob']]
        sub_df = sub_df.append(tmp_sub)
        # NOTE(review): 'merge_rank' is not created anywhere in this section
        # (only 'merge_prob' is), so this selection would raise KeyError -- confirm.
        sub_df = sub_df.append(sub_sat[(sub_sat['user_id'] == i) & (~sub_sat['jd_no'].isin(train_jd['jd_no']))][
                                   ['user_id', 'jd_no', 'merge_rank']])
    # Only the ordered (user_id, jd_no) pairs are written out.
    sub_df[['user_id', 'jd_no']].to_csv('sub_base.csv', index=False)

import pandas as pd
import numpy  as np
import jieba
import json

# This file expects its JSON input to be a list of objects: [{}, {}, ..., {}]

# Set up paths
# FILEPATH: default source file loaded by the script entry point below.
# CSV_PATH: prefix (including the trailing separator) that inputSource puts in
#           front of the CSV copy it writes for a JSON source.
# NOTE(review): both names are also assigned near the top of this file with
# different values; these assignments rebind them.
FILEPATH = "C:\\Alan .AIA\\Python\\CV_Automation\\resume_Result.json"
CSV_PATH = "C:\\Alan .AIA\\Python\\CV_Automation\\"

# Sub Functions
def inputSource(sourcePath):
    """Load a resume dataset from a JSON or CSV file into a DataFrame.

    A JSON source is additionally saved as a CSV copy under ``CSV_PATH``,
    named after the part of the file name before its first dot.

    Args:
        sourcePath: Path of the source file. The file name is the part after
            the last backslash and must contain ``.json`` or ``.csv``.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If the file name contains neither ``.json`` nor ``.csv``.
    """
    filename = sourcePath.split("\\")[-1]
    if ".json" in filename:
        targetDS = pd.read_json(sourcePath, encoding = "utf-8")
        # utf_8_sig writes a BOM at the start of the CSV copy.
        targetDS.to_csv(CSV_PATH + filename.split(".")[0] + ".csv", encoding = "utf_8_sig")
    elif ".csv" in filename:
        targetDS = pd.read_csv(sourcePath, encoding = "utf-8")
    else:
        # Fail with a clear message instead of hitting an unbound local
        # variable at the return statement.
        raise ValueError("Unsupported source file type (expected .json or .csv): " + filename)
    return targetDS

# Main Function
if __name__ == "__main__":
    # Read the default source file into a DataFrame when run as a script.
    target = inputSource(FILEPATH)
// C#/MSAL

using Microsoft.Identity.Client;
using Newtonsoft.Json.Linq;
using System;
using System.Net.Http;
using System.Net.Http.Headers;

namespace PowerApps.Samples
    class Program
        static void Main()
            // TODO Specify the Dataverse environment name to connect with.
            string resource = "";

            // Azure Active Directory app registration shared by all Power App samples.
            // For your custom apps, you will need to register them with Azure AD yourself.
            // See
            var clientId = "51f81489-12ee-4a9e-aaae-a2591f45987d";
            var redirectUri = "app://58145B91-0C36-4500-8554-080854F2AC97";

            #region Authentication

            var authBuilder = PublicClientApplicationBuilder.Create(clientId)
            var scope = resource + "/.default";
            string[] scopes = { scope };

            AuthenticationResult token = 
            #endregion Authentication

            #region Client configuration

            var client = new HttpClient
                // See
                BaseAddress = new Uri(resource + "/api/data/v9.2/"),
                Timeout = new TimeSpan(0, 2, 0)    // Standard two minute timeout on web service calls.

            // Default headers for each Web API call.
            // See
            HttpRequestHeaders headers = client.DefaultRequestHeaders;
            headers.Authorization = new AuthenticationHeaderValue("Bearer", token.AccessToken);
            headers.Add("OData-MaxVersion", "4.0");
            headers.Add("OData-Version", "4.0");
                new MediaTypeWithQualityHeaderValue("application/json"));
            #endregion Client configuration

            #region Web API call

            // Invoke the Web API 'WhoAmI' unbound function.
            // See
            // See
            var response = client.GetAsync("WhoAmI").Result;

            if (response.IsSuccessStatusCode)
                // Parse the JSON formatted service response to obtain the user ID.  
                JObject body = JObject.Parse(
                Guid userId = (Guid)body["UserId"];

                Console.WriteLine("Your user ID is {0}", userId);
                Console.WriteLine("Web API call failed");
                Console.WriteLine("Reason: " + response.ReasonPhrase);
            #endregion Web API call

            // Pause program execution by waiting for a key press.
