My first project after starting work

This afternoon, I finally finished my first project since starting work in March.

Roughly speaking, the project crawls everything under a specific Wikipedia category (along with its many subcategories) and feeds it to gensim's word2vec to train a model.
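
The crawling code isn't included below, but the core of it is just the public MediaWiki API. A minimal sketch of listing the pages directly under one category (the category name is made up, and paging via cmcontinue plus the recursion into subcategories are omitted):

# a minimal sketch, not the real crawler: list the pages directly under one
# Wikipedia category via the MediaWiki API (paging and subcategory recursion omitted)
import requests

def category_members(category):
    # "category" would be e.g. "Category:Machine learning" (made-up example)
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": category,
        "cmlimit": 500,
        "format": "json",
    }
    resp = requests.get("https://en.wikipedia.org/w/api.php", params = params)
    return [member["title"] for member in resp.json()["query"]["categorymembers"]]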

Then the simwords of the different keywords are stacked together: the similarities of a simword that appears under more than one keyword are summed, the topN are taken, and a simword template with repeated words is generated.
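
To make the aggregation concrete: if the same simword comes back as a neighbour of several keywords, its similarities add up, so words shared across keywords rise to the top. A toy illustration with made-up words and scores:

# toy illustration of the aggregation step (words and scores are made up)
from collections import Counter

simwords_per_keyword = [
    [("cloud", 0.8), ("server", 0.6)],   # neighbours of the first keyword
    [("cloud", 0.7), ("network", 0.5)],  # neighbours of the second keyword
]
totals = Counter()
for simword_list in simwords_per_keyword:
    for word, similarity in simword_list:
        totals[word] += similarity
print(totals.most_common(2))  # [('cloud', 1.5), ('server', 0.6)]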


I tried quite a few things along the way, from the company's ES to Google search, before finally settling on Wikipedia as the corpus. None of the techniques is hard, but I stepped into some pits in the process, such as how to train the model when the corpus is too large.
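
For the "corpus too large" pit specifically, gensim's standard answer is to stream the corpus from disk instead of loading it all into memory. A minimal sketch, assuming the cleaned Wikipedia text has been written out one sentence per line (the file name and hyperparameters are illustrative, not the job's real configuration; "size" is the pre-4.0 gensim parameter name):

# minimal sketch of memory-friendly training; file name and parameters are
# illustrative, not the job's real configuration
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence("wiki_corpus.txt")  # streams sentences lazily from disk
model = Word2Vec(sentences, size = 200, window = 5, min_count = 5, workers = 4)
model.save("technology_model")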

I'm pasting the code here so that when I look back at it someday, I can smile knowingly.

# -*- coding: utf-8 -*-
from __future__ import absolute_import
from nltk.corpus import stopwords
from util.logger import cluster_log
from util.date_util import DateUtil
import settings as job_settings
import gensim


class FilterTemplate(object):
    def __init__(self, category):
        # TODO: not defined campaign category
        self.category = category
        self.topN_similarity = job_settings.topN_similarity  # how many simwords to keep overall
        self.topN_simword = job_settings.topN_simword  # nearest neighbours fetched per word
        self.X = job_settings.X  # scale factor turning summed similarity into a repeat count
        self.model_root_path = job_settings.model_root_path

        self.date_util = DateUtil()

    def model_path(self):
        # each campaign category has its own model path
        # TODO: not all the categories are defined yet
        campaign_category = {
            "technology": self.model_root_path + "/technology/technology_model",
            "manufacturing": self.model_root_path + "/manufacturing/manufacturing_model"
        }
        if self.category not in campaign_category:
            # unknown category: fail fast with an explicit error
            raise Exception("Not defined category: {0}, failed to get model path".format(self.category))
        return campaign_category[self.category]

    def model_keyword_format(self, keywords_list):
        # tokenize the keywords into the flat word list the model expects
        english_stopwords = set(stopwords.words('english'))
        word_bag = []
        for keyword in keywords_list:
            try:
                if len(keyword.split(' ')) > 1:
                    # multi-word keyword: keep each word separately
                    word_bag.extend(keyword.split(' '))
                else:
                    word_bag.append(keyword)
            except Exception:
                cluster_log.info("ERROR KEYWORD: " + keyword)
                continue
        # drop stopwords and anything that is not purely alphabetic
        return [word for word in set(word_bag) if word not in english_stopwords and word.isalpha()]

    def simword_similarity(self, word_list):
        # look up each word's simwords and similarities in the word2vec model
        model = gensim.models.Word2Vec.load(self.model_path())
        all_simword_list = []
        cluster_log.info("start to find simwords and similarities in word2vec model, timestamp: {timestamp}".format(
                timestamp = self.date_util.get_current_time_str()
        ))
        for word in word_list:
            try:
                # topN_simword: how many nearest neighbours to fetch per word
                simword_similarity = model.most_similar(word, topn = self.topN_simword)
                # also keep the original word itself, with similarity 1
                simword_similarity.append((word, 1))
                all_simword_list.append(simword_similarity)
            except KeyError:
                # the word is not in the model's vocabulary
                cluster_log.info("{0} can't find simword in word2vec model".format(word))
                continue
        cluster_log.info("finished finding simwords and similarities in word2vec model, timestamp: {timestamp}".format(
                timestamp = self.date_util.get_current_time_str()
        ))
        return all_simword_list

    def clean_simword(self, simword):
        # simwords may be numbers or mojibake; keep only purely alphabetic
        # words, and drop the original keyword itself (its similarity is 1)
        return simword[0].isalpha() and simword[1] != 1

    def non_replace_simword(self, all_simword_list):
        # sum each distinct simword's similarity across all keywords
        # topN_similarity: how many simwords to keep, by summed similarity
        non_replace_simword = {}
        for simword_list in all_simword_list:
            # the entry with similarity 1 is the original keyword, kept for logging
            word = [w[0] for w in simword_list if w[1] == 1]
            try:
                for simword in simword_list:
                    if self.clean_simword(simword):
                        if simword[0] not in non_replace_simword:
                            non_replace_simword[simword[0]] = simword[1]
                        else:
                            non_replace_simword[simword[0]] += simword[1]
            except Exception:
                cluster_log.info("{0} error in all_simword_list".format(word[0]))
                continue
        return sorted(non_replace_simword.items(), key = lambda simword: simword[1], reverse = True)[
               :self.topN_similarity]

    def global_template(self, sorted_set):
        # build the template: each simword is repeated in proportion to its
        # summed similarity scaled by X, with a minimum frequency of 1
        global_template = ""
        for simword in sorted_set:
            try:
                frequency = max(int(simword[1] * self.X), 1)
                global_template += (simword[0] + ' ') * frequency
            except Exception:
                cluster_log.info("Simword: {0} ERROR".format(simword[0]))
                continue
        return global_template

    def generate_global_template(self, keywords_list):
        # full pipeline: format keywords -> find simwords -> sum similarities -> build template
        word_list = self.model_keyword_format(keywords_list)
        all_simword_list = self.simword_similarity(word_list)
        sorted_set = self.non_replace_simword(all_simword_list)
        global_template = self.global_template(sorted_set)
        if len(global_template) == 0:
            cluster_log.error("failed to generate global template")
        return global_template
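
For context, a hypothetical call site (the category and keywords are made up; topN_similarity, topN_simword and X come from the settings module):

# hypothetical usage of the class above; category and keywords are made up
if __name__ == "__main__":
    filter_template = FilterTemplate("technology")
    keywords = ["machine learning", "cloud computing", "database"]
    print(filter_template.generate_global_template(keywords))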

