C++ Outlier Detection
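The pipeline below has two parts. ai.hpp bundles a compact Word2Vec implementation (skip-gram training with hierarchical softmax over a Huffman tree, plus optional subsampling and bigram phrase detection) with a simple DBSCAN clusterer. main.cpp trains word vectors on a tokenized corpus, folds them into per-sentence vectors, and reports as outliers the sentences that DBSCAN cannot assign to any cluster, first against a sliding baseline and then in a second pass over the collected suspects.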

ai.hpp

#pragma once

#include <vector>
#include <string>
#include <list>
#include <tuple>
#include <utility>
#include <iterator>
#include <unordered_map>
#include <memory>
#include <algorithm>
#include <numeric>
#include <random>
#include <chrono>
#include <fstream>
#include <sstream>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <ctime>

typedef std::vector<float> Vector;

template <typename String> struct Cvt;

template <> struct Cvt<std::string> {
    static const std::string& to_utf8(const std::string& s) { return s; }
    static const std::string& from_utf8(const std::string& s) { return s; }
};

namespace v {
	struct LightVector: public std::pair<float*, float*>
	{
	  using Parent = std::pair<float*, float*>;
	  template <typename... Args> LightVector(Args&&... args): Parent(std::forward<Args>(args)...) {}

	  float *data() const { return first; }
	  size_t size() const { return std::distance(first, second); }
	  bool empty() const  { return first == second; }

	  float& operator[](size_t i) { return *(first + i); }
	  float operator[](size_t i) const { return *(first + i); }
	};

	template <typename Vector1, typename Vector2> inline float dot(const Vector1& x, const Vector2& y) { 
		int m = x.size(); const float *xd = x.data(), *yd = y.data();
		float sum = 0.0;
		while (--m >= 0) sum += (*xd++) * (*yd++);
		return sum;
	}

	// saxpy: x += g * y (scaled vector accumulate)
	inline void saxpy(Vector& x, float g, const Vector& y) {
		int m = x.size(); float *xd = x.data(); const float *yd = y.data();
		while (--m >= 0) (*xd++) += g * (*yd++);
	}
	
	inline void unit(Vector& x) {
		float len = std::sqrt(dot(x, x));
		if (len == 0) return;

		int m = x.size(); float *xd = x.data();
		while (--m >= 0) (*xd++) /= len;
	}
}
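// Usage sketch (illustrative values): dot() is generic over any type exposing
// data()/size(), so it accepts both Vector and a non-owning LightVector view:
//   float buf[3] = {3.0f, 0.0f, 4.0f};
//   Vector y{1.0f, 2.0f, 3.0f};
//   float s = v::dot(v::LightVector(buf, buf + 3), y);  // 3*1 + 0*2 + 4*3 = 15
//   v::unit(y);                                         // scales y to unit length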

namespace ai
{
    template <typename String = std::string>
    struct Word2Vec
    {
        struct Word
        {
            int32_t index_;
            String text_;
            uint32_t count_;
            Word *left_, *right_;
        
            std::vector<uint8_t> codes_;
            std::vector<uint32_t> points_;
        
            Word(int32_t index, String text, uint32_t count, Word *left = 0, Word *right = 0) : index_(index), text_(text), count_(count), left_(left), right_(right) {}
            Word(const Word&) = delete;
            const Word& operator = (const Word&) = delete;
        };
        typedef std::shared_ptr<Word> WordP;
        
        struct Sentence
        {
            std::vector<Word*> words_;
            std::vector<String> tokens_;
        };
        typedef std::shared_ptr<Sentence> SentenceP;

        std::vector<Vector> syn0_, syn1_;
        std::vector<Vector> syn0norm_;

        std::unordered_map<String, WordP> vocab_;
        std::vector<Word*> words_;

        int layer1_size_;
        int window_;

        //subsampling
        float sample_;

        int min_count_;

        float alpha_, min_alpha_;

        bool phrase_;
        float phrase_threshold_;

        Word2Vec(int size = 100, int window = 5, float sample = 0.001, int min_count = 5, float alpha = 0.025, float min_alpha = 0.0001) 
            :layer1_size_(size), window_(window), sample_(sample), min_count_(min_count), alpha_(alpha), min_alpha_(min_alpha), phrase_(false), phrase_threshold_(50)
        {}

        bool has(const String& w) const { return vocab_.find(w) != vocab_.end(); }

        int build_vocab(std::vector<SentenceP>& sentences) {
            size_t count = 0;
            std::unordered_map<String, int> vocab;

            for (auto& sentence: sentences) {
                ++count;

                String last_token;
                for (auto& token: sentence->tokens_) {
                    vocab[token] += 1;
                    // add bigram phrases
                    if (phrase_) {
                        if (!last_token.empty()) vocab[last_token + Cvt<String>::from_utf8("_") + token] += 1;
                        last_token = token;
                    }
                }
            }

            if (phrase_) {
                count = 0;
                int total_words = std::accumulate(vocab.begin(), vocab.end(), 0, [](int x, const std::pair<const String, int>& v) { return x + v.second; });

                std::unordered_map<String, int> phrase_vocab;

                for (auto& sentence: sentences) {
                    ++count;
                    
                    std::vector<String> phrase_tokens;
                    String last_token;
                    uint32_t pa = 0, pb = 0, pab = 0;
                    for (auto& token: sentence->tokens_) {
                        pb = vocab[token];
                        if (!last_token.empty()) {
                            String phrase = last_token + Cvt<String>::from_utf8("_") + token;
                            pab = vocab[phrase];
                            float score = 0;
                            if (pa >= min_count_ && pb >= min_count_ && pab >= min_count_)
                                score = (pab - min_count_ ) / (float(pa) * pb) * total_words;
                            if (score > phrase_threshold_) {
                                phrase_tokens.emplace_back(phrase);
                                token.clear();
                                phrase_vocab[phrase] += 1;
                            }
                            else {
                                phrase_tokens.emplace_back(last_token);
                                phrase_vocab[last_token] += 1;
                            }
                        }
                        last_token = token;
                        pa = pb;
                    }

                    if (!last_token.empty()) {
                        phrase_tokens.emplace_back(last_token);
                        phrase_vocab[last_token] += 1;
                    }
                    sentence->tokens_.swap(phrase_tokens);
                }
                
                vocab.swap(phrase_vocab);
            }

            int n_words = vocab.size();
            if (n_words <= 1) return -1;

            words_.reserve(n_words);
            auto comp = [](Word *w1, Word *w2) { return w1->count_ > w2->count_; };

            for (auto& p: vocab) {
                uint32_t count = p.second;
                if (count <= min_count_) continue;

                auto r = vocab_.emplace(p.first, WordP(new Word{0, p.first, count}));
                if (r.second) words_.emplace_back(r.first->second.get());
            }
            std::sort(words_.begin(), words_.end(), comp);

            int index = 0;
            for (auto& w: words_) w->index_ = index++;

            n_words = words_.size();
            
            std::vector<Word*> heap = words_;
            std::make_heap(heap.begin(), heap.end(), comp);

            // Huffman tree: repeatedly merge the two least frequent nodes
            std::vector<WordP> tmp;
            for (int i = 0; i < n_words - 1; ++i) {
                std::pop_heap(heap.begin(), heap.end(), comp);
                Word *min1 = heap.back(); heap.pop_back();
                std::pop_heap(heap.begin(), heap.end(), comp);
                Word *min2 = heap.back(); heap.pop_back();
                tmp.emplace_back(WordP(new Word{i + n_words, Cvt<String>::from_utf8(""), min1->count_ + min2->count_, min1, min2}));
                heap.emplace_back(tmp.back().get());
                std::push_heap(heap.begin(), heap.end(), comp);
            }

            int max_depth = 0;
            std::list<std::tuple<Word*, std::vector<uint32_t>, std::vector<uint8_t>>> stack;
            stack.emplace_back(std::make_tuple(heap[0], std::vector<uint32_t>(), std::vector<uint8_t>()));
            count = 0;
            while (!stack.empty()) {
                auto t = stack.back();
                stack.pop_back();

                Word *word = std::get<0>(t);
                if (word->index_ < n_words) {
                    word->points_ = std::get<1>(t);
                    word->codes_ = std::get<2>(t);
                    max_depth = std::max((int)word->codes_.size(), max_depth);
                }
                else {
                    auto points = std::get<1>(t);
                    points.emplace_back(word->index_ - n_words);
                    auto codes1 = std::get<2>(t);
                    auto codes2 = codes1;
                    codes1.emplace_back(0); codes2.emplace_back(1);
                    stack.emplace_back(std::make_tuple(word->left_, points, codes1));
                    stack.emplace_back(std::make_tuple(word->right_, points, codes2));
                }
            }

            syn0_.resize(n_words);
            syn1_.resize(n_words);

            std::default_random_engine eng(::time(NULL));
            std::uniform_real_distribution<float> rng(0.0, 1.0);
            for (auto& s: syn0_) { 
                s.resize(layer1_size_);
                for (auto& x: s) x = (rng(eng) - 0.5) / layer1_size_;
            }
            for (auto& s: syn1_) s.resize(layer1_size_);

            return 0;
        }
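        // After build_vocab(), every retained word carries its Huffman path:
        // codes_ holds the 0/1 branch decisions and points_ the indices of the
        // inner nodes (rows of syn1_) on the way to that word; frequent words
        // get short paths. train_sentence() walks exactly this path for the
        // hierarchical-softmax update.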

        int train(std::vector<SentenceP>& sentences) {
            int total_words = std::accumulate(vocab_.begin(), vocab_.end(), 0, 
                [](int x, const std::pair<const String, WordP>& p) { return (int)(x + p.second->count_); });
            int current_words = 0;
            float alpha0 = alpha_, min_alpha = min_alpha_;

            std::default_random_engine eng(::time(NULL));
            std::uniform_real_distribution<float> rng(0.0, 1.0);

            size_t n_sentences = sentences.size();

            size_t last_words = 0;
            auto cstart = std::chrono::high_resolution_clock::now();

            #pragma omp parallel for
            for (size_t i=0; i < n_sentences; ++i) {
                auto sentence = sentences[i].get();
                if (sentence->tokens_.empty()) 
                    continue;
                size_t len = sentence->tokens_.size();
                for (size_t j = 0; j < len; ++j) {
                    auto it = vocab_.find(sentence->tokens_[j]);
                    if (it == vocab_.end()) continue; 
                    Word *word = it->second.get();
                    // subsampling: randomly drop very frequent words
                    if (sample_ > 0) {
                        float rnd = (std::sqrt(word->count_ / (sample_ * total_words)) + 1) * (sample_ * total_words) / word->count_;
                        if (rnd < rng(eng)) continue;
                    }
                    }
                    sentence->words_.emplace_back(it->second.get());
                }

                float alpha = std::max(min_alpha, float(alpha0 * (1.0 - 1.0 * current_words / total_words)));
                Vector work(layer1_size_);
                size_t words = train_sentence(*sentence, alpha, work);

                #pragma omp atomic
                current_words += words;

                if (current_words - last_words > 1024 * 100 || i == n_sentences - 1) {
                    auto cend = std::chrono::high_resolution_clock::now();
                    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(cend - cstart).count();
                    // printf("training alpha: %.4f progress: %.2f%% words per sec: %.3fK\n", alpha, current_words * 100.0/total_words, (current_words - last_words) * 1000.0 / duration);
                    last_words = current_words;
                    cstart = cend;
                }
            }

            syn0norm_ = syn0_;
            for (auto& x: syn0norm_) v::unit(x);

            return 0;
        }

        int save(const std::string& file) const {
            std::ofstream out(file, std::ofstream::out);
            out << syn0_.size() << " " << syn0_[0].size() << std::endl;
            
            std::vector<Word*> words = words_;
            std::sort(words.begin(), words.end(), [](Word *w1, Word *w2) { return w1->count_ > w2->count_; });

            for (auto w: words) {
                out << Cvt<String>::to_utf8(w->text_);
                for (auto i: syn0_[w->index_]) out << " " << i;
                out << std::endl;
            }

            return 0;
        }
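        // On-disk format: a header line "<vocab_size> <layer1_size>" followed by
        // one line per word: the UTF-8 token and its layer1_size_ float components.
        // load() below parses exactly this layout.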

        int load(const std::string& file) {
            std::ifstream in(file);
            std::string line;
            if (! std::getline(in, line)) return -1;

            int n_words = 0, layer1_size = 0;
            std::istringstream iss(line);
            iss >> n_words >> layer1_size;

            syn0_.clear(); vocab_.clear(); words_.clear();
            syn0_.resize(n_words);
            for (int i = 0; i < n_words; ++i) {
                if (!std::getline(in, line)) return -1;

                std::istringstream parts(line);
                std::string text;
                parts >> text;

                auto p = vocab_.emplace(Cvt<String>::from_utf8(text), WordP(new Word{i, Cvt<String>::from_utf8(text), 0}));
                words_.emplace_back(p.first->second.get());
                syn0_[i].resize(layer1_size);
                for (int j = 0; j < layer1_size; ++j) {
                    parts >> syn0_[i][j];
                }
            }
            
            layer1_size_ = layer1_size;

            syn0norm_ = syn0_;
            for (auto& x: syn0norm_) v::unit(x);
            
            return 0;	
        }
        
        std::vector<std::pair<String, float>> most_similar(std::vector<String> positive, std::vector<String> negative, int topn) {
            if ((positive.empty() && negative.empty()) || syn0norm_.empty()) return std::vector<std::pair<String, float>>{};

            Vector mean(layer1_size_);
            std::vector<int> all_words;
            auto add_word = [&mean, &all_words, this](const String& w, float weight) {
                auto it = vocab_.find(w);		
                if (it == vocab_.end()) return;

                Word& word = *it->second;
                v::saxpy(mean, weight, syn0norm_[word.index_]);

                all_words.push_back(word.index_);
            };

            for (auto& w: positive) add_word(w, 1.0);
            for (auto& w: negative) add_word(w, -1.0);

            v::unit(mean);

            Vector dists;
            std::vector indexes;
            int i=0;

            dists.reserve(syn0norm_.size());
            indexes.reserve(syn0norm_.size());
            for (auto &x: syn0norm_) {
                dists.push_back(v::dot(x, mean));		
                indexes.push_back(i++);
            }

            auto comp = [&dists](int i, int j) { return dists[i] > dists[j]; };
    
            int k = std::min(int(topn+all_words.size()), int(indexes.size())-1);
            auto first = indexes.begin(), last = indexes.begin() + k, end = indexes.end();
            std::make_heap(first, last + 1, comp);
            std::pop_heap(first, last + 1, comp);
            for (auto it = last + 1; it != end; ++it) {
                if (!comp(*it, *first)) continue;
                *last = *it;
                std::pop_heap(first, last + 1, comp);
            }
            
            std::sort_heap(first, last, comp);

            std::vector<std::pair<String, float>> results;
            for (int i = 0, j = 0; i < k; ++i) {
                if (std::find(all_words.begin(), all_words.end(), indexes[i]) != all_words.end()) continue;
                results.push_back(std::make_pair(words_[indexes[i]]->text_, dists[indexes[i]]));
                if (++j >= topn) break;
            }

            return results;
        }

        const Vector& word_vector(const String& w) const {
            static Vector nil;
            auto it = vocab_.find(w);
            if (it == vocab_.end()) return nil;
            
            return syn0_[it->second->index_];
        }

        size_t word_vector_size() const { return layer1_size_; }

        // Element-wise average of two word vectors. If the sizes differ
        // (e.g. one side is still empty), the non-empty side is returned as-is.
        Vector calculate_words_average(const Vector& word_vec1, const Vector& word_vec2) {
            size_t size = word_vec1.size();

            if (size != word_vec2.size()) {
                if (word_vec1.empty()) return word_vec2;
                return word_vec1;
            }

            Vector result(size);
            for (size_t i = 0; i < size; ++i) result[i] = (word_vec1[i] + word_vec2[i]) / 2.0f;

            return result;
        }

    private:
        int train_sentence(Sentence& sentence, float alpha, Vector& work) {
            const int max_size = 1000;
            const float max_exp = 6.0;
            // precomputed sigmoid table: table[i] = sigmoid((i / max_size * 2 - 1) * max_exp)
            const static std::vector<float> table = [&](){
                std::vector<float> x(max_size);
                for (int i = 0; i < max_size; ++i) {
                    float f = std::exp((i / float(max_size) * 2 - 1) * max_exp);
                    x[i] = f / (f + 1);
                }
                return x;
            }();

            int count = 0;
            int len = sentence.words_.size();
            int reduced_window = std::rand() % window_;
            for (int i = 0; i < len; ++i) {
                const Word& current = *sentence.words_[i];
                size_t codelen = current.codes_.size();

                // dynamic window around position i
                int j = std::max(0, i - window_ + reduced_window);
                int k = std::min(len, i + window_ + 1 - reduced_window);
                for (; j < k; ++j) {
                    const Word *word = sentence.words_[j];
                    if (j == i || word->codes_.empty()) 
                        continue;
                    int word_index = word->index_;
                    auto& l1 = syn0_[word_index];

                    std::fill(work.begin(), work.end(), 0);
                    // hierarchical softmax: walk the Huffman path of the current word
                    for (size_t b = 0; b < codelen; ++b) {
                        auto& l2 = syn1_[current.points_[b]];

                        float f = v::dot(l1, l2);
                        if (f <= -max_exp || f >= max_exp) 
                            continue;
            
                        int fi = int((f + max_exp) * (max_size / max_exp / 2.0));
                        
                        f = table[fi];
                        float g = (1 - current.codes_[b] - f) * alpha;

                        v::saxpy(work, g, l2);   // accumulate gradient for l1
                        v::saxpy(l2, g, l1);     // update inner-node vector
                    }
                    v::saxpy(l1, 1.0, work);     // apply accumulated update to the context word
                }
                ++count;
            }

            return count;
        }

        float similarity(const String& w1, const String& w2) const {
            auto it1 = vocab_.find(w1), it2 = vocab_.find(w2);
            if (it1 != vocab_.end() && it2 != vocab_.end())
                return v::dot(syn0_[it1->second->index_], syn0_[it2->second->index_]);

            return 0;
        }
    };

    template <typename Double = double>
    struct DBSCAN
    {
        struct Model {
            Vector features;   // feature vector
            int label;         // cluster label (0 = noise)
        };

        Double epsilon;
        size_t minPts;

        DBSCAN(Double epsilon = 0.6, size_t minPts = 1) : epsilon(epsilon), minPts(minPts) {}

        std::vector<int> dbscan(const std::vector<Vector>& sentences) {
            int n = sentences.size();
            std::vector<int> labels(n, -1);   // -1 = unvisited, 0 = noise, >0 = cluster id
            int cluster = 0;

            for (int i = 0; i < n; ++i) {
                if (labels[i] != -1) continue;

                std::vector<int> neighbors;
                for (int j = i + 1; j < n; ++j) {
                    Double distance = calculate_distance(sentences[i], sentences[j]);
                    if (distance <= epsilon) neighbors.emplace_back(j);
                }
                int size = neighbors.size();
                if ((size_t)size < minPts) {
                    labels[i] = 0;   // provisional noise; may be adopted by a cluster later
                } else {
                    ++cluster;
                    labels[i] = cluster;

                    for (int j = 0; j < size; ++j) {
                        int neighborIndex = neighbors[j];
                        if (labels[neighborIndex] == 0) {
                            labels[neighborIndex] = cluster;
                        }

                        if (labels[neighborIndex] != -1) continue;

                        labels[neighborIndex] = cluster;

                        std::vector<int> newNeighbors;
                        for (int k = neighborIndex + 1; k < n; ++k) {
                            Double distance = calculate_distance(sentences[neighborIndex], sentences[k]);
                            if (distance <= epsilon) newNeighbors.emplace_back(k);
                        }

                        if (newNeighbors.size() >= minPts) neighbors.insert(neighbors.end(), newNeighbors.begin(), newNeighbors.end());
                    }
                }
            }

            return labels;
        }

        int predict(const std::vector<Model>& model, const Vector& sentence) {
            int n = model.size();

            for (int i = 0; i < n; ++i) {
                Double distance = calculate_distance(model[i].features, sentence);
                if (distance <= epsilon - 2) {   // note: stricter threshold than the clustering epsilon
                    if (model[i].label != 0) {
                        return model[i].label;
                    }
                }
            }

            return 0;   // no non-noise cluster matched: treat as outlier
        }

        std::vector<Model> create_model(const std::vector<Vector>& baseline, const std::vector<int>& labels) {
            std::vector<Model> model;
            size_t numSamples = std::min(baseline.size(), labels.size());

            for (size_t i = 0; i < numSamples; ++i) {
                Model m;
                m.features = baseline[i];
                m.label = labels[i];
                model.emplace_back(m);
            }

            return model;
        }

        ~DBSCAN() {}

    private:
        // L1 (Manhattan) distance; assumes both vectors have the same length.
        Double calculate_distance(const Vector& v1, const Vector& v2) {
            Double distance = 0.0;

            size_t size = v1.size();
            for (size_t i = 0; i < size; ++i) {
                Double diff = v1[i] - v2[i];
                distance += std::abs(diff);
            }

            return distance;
        }
    };
} // namespace ai
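Before wiring the two pieces together, it is worth seeing the DBSCAN interface in isolation. Below is a minimal standalone sketch (file name and data values are illustrative, not part of the project): three mutually close points form cluster 1, and the distant fourth point keeps label 0, i.e. noise, which is exactly the signal the outlier passes key on.

// dbscan_demo.cpp -- illustrative only
#include "ai.hpp"
#include <iostream>

int main() {
    ai::DBSCAN<double> dbscan(0.5 /*epsilon*/, 2 /*minPts*/);

    std::vector<Vector> points = {
        {0.0f, 0.0f}, {0.1f, 0.0f}, {0.0f, 0.1f},  // tight cluster
        {5.0f, 5.0f}                               // isolated point
    };

    std::vector<int> labels = dbscan.dbscan(points);
    for (size_t i = 0; i < labels.size(); ++i)
        std::cout << "point " << i << " -> label " << labels[i] << "\n";
    // prints labels 1 1 1 0 (0 = noise)
}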

main.cpp

// OpenMP is required.
// g++-4.8 -o w2v -fopenmp -std=c++0x -Ofast -march=native -funroll-loops main.cpp -lpthread

#include "ai.hpp"
#include <iostream>
#include <fstream>
#include <sstream>
#include <list>
#include <unordered_map>

#define MODEL_SIZE 1000

std::vector<ai::Word2Vec<std::string>::SentenceP> get_sentences(const std::string& filename) {
	using Sentence = ai::Word2Vec<std::string>::Sentence;
	using SentenceP = ai::Word2Vec<std::string>::SentenceP;
	
	SentenceP sentence(new Sentence);

	std::ifstream ifs;
	ifs.open(filename, std::ios::in);
	std::string line;
	std::list<SentenceP> sentence_list;

	while (std::getline(ifs, line)) {
		std::string token;
		std::istringstream iss(line);

		while (iss >> token) {
			sentence->tokens_.push_back(std::move(token));
		}

		if (!sentence->tokens_.empty()) {
			sentence_list.push_back(std::move(sentence));
			sentence.reset(new Sentence);
		}
	}

	std::vector<SentenceP> sentences(sentence_list.begin(), sentence_list.end());

	return sentences;
}

int train_w2v(ai::Word2Vec<std::string>& w2v, std::vector<ai::Word2Vec<std::string>::SentenceP>& sentences) {
	if (w2v.build_vocab(sentences) != 0) return -1;
	if (w2v.train(sentences) != 0) return -1;
	if (w2v.save("word2vec") != 0) return -1;

	return 0;
}

std::vector<Vector> get_sentences_vector(ai::Word2Vec<std::string>& w2v, std::vector<ai::Word2Vec<std::string>::SentenceP>& sentences) {
	std::vector<Vector> sentences_vector;
	for (auto sentence = sentences.begin(); sentence != sentences.end(); ++sentence) {
		Vector sentence_vector;
		for (auto token = (*sentence)->tokens_.begin(); token != (*sentence)->tokens_.end(); ++token) 
			if (w2v.has(*token)) sentence_vector = w2v.calculate_words_average(sentence_vector, w2v.word_vector(*token));
		
		if (!sentence_vector.empty()) sentences_vector.push_back(std::move(sentence_vector));
	}

	return sentences_vector;
}
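// Note: calculate_words_average() folds vectors pairwise, so the resulting
// sentence vector weights later tokens more heavily (each earlier token's
// contribution is roughly halved at every step). A hypothetical alternative,
// not used by the driver below, is an unweighted mean in which every known
// token contributes equally:
Vector sentence_mean(ai::Word2Vec<std::string>& w2v, const std::vector<std::string>& tokens) {
	Vector mean;
	int n = 0;
	for (const std::string& t : tokens) {
		if (!w2v.has(t)) continue;
		const Vector& wv = w2v.word_vector(t);
		if (mean.empty()) mean.assign(wv.size(), 0.0f);
		v::saxpy(mean, 1.0f, wv);   // mean += wv
		++n;
	}
	if (n > 0) for (float& x : mean) x /= n;
	return mean;
}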

int main(int argc, const char *argv[])
{
	std::shared_ptr<ai::Word2Vec<std::string>> w2v = std::make_shared<ai::Word2Vec<std::string>>(32);
	w2v->sample_ = 0;
	w2v->min_count_= 1;
	// w2v->window_ = 10;

	std::vector<ai::Word2Vec<std::string>::SentenceP> sentences = get_sentences("sentences.txt");
	if (train_w2v((*w2v), sentences) != 0) return -1;

	std::vector<Vector> sentences2vector = get_sentences_vector((*w2v), sentences);
	std::cout << "sentence vector size:" << sentences2vector.size() << std::endl;

	ai::DBSCAN<double> dbscan;
	dbscan.epsilon = 5.03;
	dbscan.minPts = 3;

	std::vector<Vector> second_data;
	using second_SentenceP = ai::Word2Vec<std::string>::SentenceP;
	std::vector<second_SentenceP> second_sentences_;

	int num = 0;
	size_t size1 = sentences2vector.size();
	for (int i = 0; i < 50; i += 2) {
		if ((size_t)((i + 1) * MODEL_SIZE) > size1) break;   // not enough vectors left for a full baseline chunk
		std::vector<Vector> baseline(sentences2vector.begin() + i*MODEL_SIZE, sentences2vector.begin() + (i+1)*MODEL_SIZE);
		std::vector<int> labels = dbscan.dbscan(baseline);

		std::unordered_map<int, Vector> baseline_map;

		for (size_t j = 0; j < labels.size(); j++) {
			baseline_map[labels[j]] = w2v->calculate_words_average(baseline_map[labels[j]], baseline[j]);
		}

		std::vector<int> new_labels;
		std::vector<Vector> new_baseline;

		for (const auto& pair : baseline_map) {
			new_labels.push_back(pair.first);
			new_baseline.push_back(pair.second);
		}

		std::vector<ai::DBSCAN<double>::Model> model = dbscan.create_model(baseline, labels);
		// std::vector<ai::DBSCAN<double>::Model> model = dbscan.create_model(new_baseline, new_labels);

		for (int j = 0; j < MODEL_SIZE; j++) {
			if ((size_t)(j + (i+1)*MODEL_SIZE) >= size1) goto over;
			Vector log = sentences2vector.at(j + (i+1)*MODEL_SIZE);

			if (!dbscan.predict(model, log)) {
				second_data.push_back(log);
				second_sentences_.push_back(sentences[j+(i+1)*MODEL_SIZE]);
				num++;
				std::cout << "First-pass outlier: ";
				const ai::Word2Vec<std::string>::SentenceP& outlier = sentences[j+(i+1)*MODEL_SIZE];
				for (const std::string& token : outlier->tokens_) std::cout << token << " ";
				std::cout << std::endl;
			}
		}
		std::cout << std::endl;
	}

over:
	std::cout << "num:" << num << std::endl;

	std::vector<ai::Word2Vec<std::string>::SentenceP> second_sentences(second_sentences_.begin(), second_sentences_.end());
	num = 0;
	std::vector<int> labels = dbscan.dbscan(second_data);
	for (size_t i = 0; i < labels.size(); i++) {
		if (labels[i] == 0) {
			num++;
			std::cout << "Second-pass outlier: ";
			const ai::Word2Vec<std::string>::SentenceP& outlier = second_sentences[i];
			for (const std::string& token : outlier->tokens_) std::cout << token << " ";
			std::cout << std::endl;
		}
	}
	std::cout << "num:" << num << std::endl;
}
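A note on scale: the corpus is one whitespace-tokenized sentence per line. With MODEL_SIZE fixed at 1000, the first pass walks up to 25 windows (i = 0, 2, ..., 48), each clustering one 1000-sentence baseline chunk and scoring the following chunk against it, so it can consume up to 50,000 sentence vectors; on a smaller corpus the bounds checks and the goto over simply end the pass early.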

sentences.txt

abc def ghi sdff ghrio oongs
sdgg sdgf jong wongg lllg
jjogg weg egege geron osdg lsdg
...
