pyspark+jieba分词 数据标注(提取动词)

 
  
from pyspark import SparkConf ,SparkContext
import jieba.posseg as posseg


def merge(nested):
    """Flatten a list of lists into one flat list.

    Args:
        nested: an iterable of lists (e.g. the per-line verb lists
            produced by mapping ``split`` over the input file).

    Returns:
        A new flat list with every element of every sublist, in order.

    NOTE(review): unlike the earlier version, the input is NOT cleared;
    mutating a caller's argument is a surprising side effect, and no
    caller relied on it. The parameter is also renamed so it no longer
    shadows the builtin ``list``.
    """
    return [item for sublist in nested for item in sublist]

def split(line):
    """Segment one line of Chinese text and keep the short verbs.

    Runs jieba part-of-speech tagging over *line* and collects every
    token tagged as a verb (``flag == 'v'``) whose surface form is 1 or
    2 characters long; longer verbs are dropped.

    Args:
        line: a single line of text to segment.

    Returns:
        list[str]: the short verbs found, in order of appearance.
    """
    verbs = []
    for word, flag in posseg.cut(line):
        # keep only 1- and 2-character verbs
        if len(word) < 3 and flag == 'v':
            verbs.append(word)
    return verbs
def main(sc):
    """Count short-verb frequencies in D:/NAV.txt and print the result.

    Args:
        sc: an active SparkContext.
    """
    # Read the input file (one record per line).
    text = sc.textFile("D:/NAV.txt")
    # flatMap yields a single flat RDD of verbs directly, avoiding the
    # original map -> collect -> merge -> parallelize round-trip through
    # the driver. (Feeding the *nested* per-line lists to reduceByKey is
    # what raised the "list not hashable" error noted below — flattening
    # first makes each element a plain string key.)
    verbs = text.flatMap(split)
    results = (verbs
               .map(lambda word: (word, 1))
               .reduceByKey(lambda a, b: a + b)
               .collect())
    # To persist instead of printing:
    # .repartition(1).saveAsTextFile("C:\\Users\\yunduo\\Desktop\\22.txt")
    print(results)

if __name__ =="__main__":

    #Create SparkConf
    sparkConf =SparkConf().setAppName('Python').setMaster('local[2]')
    #Create SparkContext
    sc=SparkContext(conf=sparkConf)
    main(sc)
    #print("Fv"+first_verb)
   # print("TV"+Two_verb)





2
[['打开', '背'], ['打开']]
['打开', '背', '打开']
[Stage 1:>                                                          (0 + 2) / 2]D:\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\shuffle.py:58: UserWarning: Please install psutil to have better support with spilling
D:\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\shuffle.py:58: UserWarning: Please install psutil to have better support with spilling
[Stage 2:>                                                          (0 + 2) / 2]D:\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\shuffle.py:58: UserWarning: Please install psutil to have better support with spilling
D:\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\shuffle.py:58: UserWarning: Please install psutil to have better support with spilling
[('背', 1), ('打开', 2)]


注意:split 之后得到的是嵌套数组——map 对每一行返回一个列表,所以结果是"一行一个数组"。把这种嵌套结构直接传给后续的 map 就会报错:map 解包时报 too many values。

而直接把嵌套列表交给 reduceByKey 则报 list not hashable(列表不可哈希,不能作为键)。

所以你只能变成一个数组。merge以后就OK了。

好了。动词已拿到,排个序取个 top 还是很容易的。接下来就要考虑动词和名词如何关联取出了。


 
  
 
  
 
  
 
  
 
  
 
 

你可能感兴趣的:(个人日记)