Xiaoyang, a real expert, wrote a PLDA implementation for text clustering and let me share it here. It mainly consists of the files below, pasted in the order shown:
// dataset.h
#ifndef DATASET_H
#define DATASET_H

#include <vector>
#include <string>
#include "ldagibbs.h"
#include "dictionary.h"

class DataSet {
 public:
  DataSet();
  ~DataSet();

  DocumentId AppendDocument(const char *title) {
    document_titles_.push_back(std::string(title));
    document_term_ids_.push_back(std::vector<TermId>());
    return document_term_ids_.size() - 1;
  }

  void AppendTerm(DocumentId document_id, const char *term) {
    TermId term_id = dictionary_->GetTermId(term);
    document_term_ids_[document_id].push_back(term_id);
  }

  TermId GetTermId(DocumentId document_id, size_t term_index) const {
    return document_term_ids_[document_id][term_index];
  }

  Dictionary *dictionary() const { return dictionary_; }
  DocumentId DocumentNumber() const { return document_term_ids_.size(); }
  TermId TermNumber() const { return dictionary_->term_count(); }
  size_t DocumentLength(DocumentId document_id) const {
    return document_term_ids_[document_id].size();
  }
  const char *DocumentTitle(DocumentId document_id) const {
    return document_titles_[document_id].c_str();
  }

 private:
  std::vector<std::vector<TermId> > document_term_ids_;
  std::vector<std::string> document_titles_;
  Dictionary *dictionary_;
};

#endif  // DATASET_H
// dictionary.h
#ifndef DICT_H
#define DICT_H

#include <string>
#include <map>
#include "ldagibbs.h"

class Dictionary {
 public:
  static const TermId kNotExist = -1;

  Dictionary(): term_count_(0) {}

  // Returns the id of a term, inserting it if it is not in the dictionary yet.
  TermId GetTermId(const char *term_str);

  // Read-only lookup: returns kNotExist for unseen terms.
  TermId FindIdByTerm(const char *term_str) const {
    TermId term_id;
    std::map<std::string, TermId>::const_iterator it =
        map_term_id_.find(std::string(term_str));
    if (it != map_term_id_.end()) {
      term_id = it->second;
    } else {
      term_id = kNotExist;
    }
    return term_id;
  }

  const char *GetTermStr(TermId term_id) const {
    return map_id_term_.at(term_id).c_str();
  }

  TermId term_count() const { return term_count_; }

 private:
  std::map<std::string, TermId> map_term_id_;
  std::map<TermId, std::string> map_id_term_;
  TermId term_count_;
};

#endif  // DICT_H
// lda_gibbs_inferencer.h
#ifndef LDA_GIBBS_INFERENCER
#define LDA_GIBBS_INFERENCER

#include <vector>
#include "ldagibbs.h"
#include "dictionary.h"
#include "utils.h"

class LdaGibbsInferencer {
 public:
  LdaGibbsInferencer(TopicId topic_number, Dictionary *dictionary,
                     double alpha, double beta);

  void UpdatePhi(TopicId topic_id, TermId term_id, double value);

  // Runs Gibbs sampling on a single new document with phi held fixed and
  // writes the document's topic distribution into topic_distribution.
  void Inference(int iteration, size_t document_length,
                 const TermId *document_terms, double *topic_distribution);

  TopicId topic_number() { return topic_number_; }

 private:
  class Instance;

  void InitializeInstance(Instance *instance);
  void GibbsSamplingOne(Instance *instance, size_t term_index);
  void TermTopicDistribution(Instance *instance, size_t term_index,
                             double *distribution);
  TopicId SamplingTopicFromDistribution(double *distribution) const;

  // Terms unseen at training time get a uniform probability.
  double PhiGet(TopicId topic_id, TermId term_id) const {
    double phi_value;
    if (term_id == Dictionary::kNotExist) {
      phi_value = 1.0 / static_cast<double>(term_number_);
    } else {
      phi_value = phi_[topic_id][term_id];
    }
    return phi_value;
  }

  Dictionary *dictionary_;
  TopicId topic_number_;
  TermId term_number_;
  double alpha_;
  double beta_;
  double **phi_;
};

class LdaGibbsInferencer::Instance {
 public:
  size_t document_length;
  const TermId *document_content;
  int *document_topic_count;
  TopicId *document_term_topic;
  double *temp_term_distribution;

  Instance(size_t document_length, const TermId *document_content,
           TopicId topic_number):
      document_length(document_length),
      document_content(document_content),
      document_topic_count(AllocArray<int>(topic_number)),
      document_term_topic(AllocArray<TopicId>(document_length)),
      temp_term_distribution(new double[topic_number]) {}

  ~Instance() {
    FreeArray(document_topic_count);
    document_topic_count = NULL;
    FreeArray(document_term_topic);
    document_term_topic = NULL;
    delete[] temp_term_distribution;
    temp_term_distribution = NULL;
  }
};

#endif  // LDA_GIBBS_INFERENCER
// lda_gibbs_model.h
#ifndef LDAGM_H
#define LDAGM_H

#include "dataset.h"

class LdaGibbsModel {
 public:
  LdaGibbsModel(DataSet *data_set, TopicId topic_number, double alpha, double beta);
  ~LdaGibbsModel();

  void InitializeModel();
  void GibbsSampling(int iteration);

  // phi[k][w]: probability of term w under topic k, with beta smoothing.
  double CalculatePhi(TopicId topic_id, TermId term_id) const {
    return (topic_term_count_[topic_id][term_id] + beta_) /
           (topic_term_sum_[topic_id] + beta_ * term_number_);
  }

  // theta[d][k]: probability of topic k in document d, with alpha smoothing.
  double CalculateTheta(DocumentId document_id, TopicId topic_id) const {
    return (document_topic_count_[document_id][topic_id] + alpha_) /
           (document_topic_sum_[document_id] + alpha_ * topic_number_);
  }

  TopicId topic_number() { return topic_number_; }
  TermId term_number() { return term_number_; }
  DocumentId document_number() { return document_number_; }

 private:
  void TopicDistribution(DocumentId document_id, size_t term_index,
                         double *distribution) const;
  TopicId SamplingTopicFromDistribution(double *distribution) const;
  void GibbsSamplingOne(DocumentId document_id, size_t term_index);

  int *document_topic_sum_;
  int **document_topic_count_;
  int *topic_term_sum_;
  int **topic_term_count_;
  TopicId **document_term_topic_;
  TermId term_number_;
  DocumentId document_number_;
  TopicId topic_number_;
  double alpha_;
  double beta_;
  DataSet *data_set_;
};

#endif  // LDAGM_H
// ldagibbs.h -- shared id typedefs
#ifndef LDAGIBBS_H
#define LDAGIBBS_H

typedef short TopicId;
typedef long TermId;
typedef long DocumentId;

#endif  // LDAGIBBS_H
// utils.h
#ifndef UTILS_H
#define UTILS_H

#include <stddef.h>
#include <string.h>

// Returns a uniform random double in [0, 1). Defined in lda_gibbs_model.cc.
double Random0to1();

//
// Allocates a 2-dimensional array of type T filled with 0.
// (memset-zeroing is only valid for trivially-copyable element types.)
//
template<class T>
T **AllocMatrix(size_t rows, size_t columns) {
  T **matrix = new T *[rows];
  for (size_t row = 0; row < rows; ++row) {
    matrix[row] = new T[columns];
    memset(matrix[row], 0, columns * sizeof(T));
  }
  return matrix;
}

//
// Deletes a 2-dimensional array.
//
template<class T>
void FreeMatrix(T **matrix, size_t rows) {
  for (size_t row = 0; row < rows; ++row) {
    delete[] matrix[row];
  }
  delete[] matrix;
}

template<class T>
T *AllocArray(size_t length) {
  T *array = new T[length];
  memset(array, 0, length * sizeof(T));
  return array;
}

template<class T>
void FreeArray(T *array) {
  delete[] array;
}

#endif  // UTILS_H
#include "dataset.h" DataSet::DataSet(): dictionary_(new Dictionary()) {} DataSet::~DataSet() { delete dictionary_; }
// dictionary.cc
#include <string>
#include <map>
#include "ldagibbs.h"
#include "dictionary.h"

TermId Dictionary::GetTermId(const char *term_str) {
  std::string term_string(term_str);
  TermId term_id;
  std::map<std::string, TermId>::const_iterator it;
  if ((it = map_term_id_.find(term_string)) != map_term_id_.end()) {
    term_id = it->second;
  } else {
    term_id = term_count_;
    term_count_++;
    map_term_id_[term_string] = term_id;
    map_id_term_[term_id] = term_string;
  }
  return term_id;
}
#include "ldagibbs.h" #include "lda_gibbs_inferencer.h" #include "utils.h" LdaGibbsInferencer::LdaGibbsInferencer(TopicId topic_number, Dictionary *dictionary, double alpha, double, beta): alpha_(alpha), beta_(beta), topic_number_(topic_number), term_number_(dictionary->term_count()), dictionary_(dictionary_), phi_(AllocMatrix(topic_number, term_number_)) {} void LdaGibbsInferencer::UpdatePhi(TopicId topic_id, TermId term_id, double value) { phi_[topic_id][term_id] = value; } void LdaGibbsInferencer::Inference(int iteration, size_t document_length, const TermId *document_terms, double *topic_distribution) { Instance *instance = new Instance(document_length, document_terms, topic_number_); InitializeInstance(instance); for (int i = 0; i < iteration; ++i) { for (size_t term_index = 0; term_index < document_length; ++term_index) { GibbsSamplingOne(instance, term_index); } } // calculate topic distribution for (TopicId topic_id = 0; topic_id < topic_number_; ++topic_id) { topic_distribution[topic_id] = (instance->document_topic_count[topic_id] + alpha_) / (instance->document_length + alpha_ * topic_number); } delete instance; instance = NULL; } void LdaGibbsInferencer::InitializeInstance(Instance *instance) { srand(static_cast<unsigned int>(time(NULL))); for (size_t term_index = 0; term_index < instance->document_length; ++term_index) { TopicId topic_id = rand() % topic_number_; instance->document_topic_count_[topic_id]++; instance->document_term_topic_[term_index] = topic_id; } } void LdaGibbsInferencer::GibbsSamplingOne(Instance *instance, size_t term_index) { TermId term_id = instance->document_content[term_index]; TopicId topic_id = instance->document_term_topic[term_index]; TermTopicDistribution(document_id, term_index, instance->temp_term_distribution); TopicId new_topic_id = SamplingTopicFromDistribution(instance->temp_term_distribution); if (new_topic_id != topic_id) { instance->document_term_topic[term_index] = new_topic_id; instance->document_topic_count[topic_id]--; instance->document_topic_count[new_topic_id]++; } } void LdaGibbsInferencer::TermTopicDistribution(Instance *instance, size_t term_index, double *distribution) { TopicId term_topic_id = instance->document_term_topic[term_index]; TermId term_id = instance->document_content[term_index]; for (int estimate_topic_id = 0; estimate_topic_id < topic_number_; ++estimate_topic_id) { int exclude = 0; if (estimate_topic_id == term_topic_id) { exclude = 1; } distribution[estimate_topic_id] = (instance->document_topic_count[estimate_topic_id] + alpha_ - exclude) * PhiGet(estimate_topic_id, term_id); } } TopicId LdaGibbsInferencer::SamplingTopicFromDistribution(double *distribution) const { double sum = 0; for (TopicId topic_id = 0; topic_id < topic_number_; ++topic_id) { sum += distribution[topic_id]; } double random_value = Random0to1() * sum; sum = 0; TopicId topic_id = 0; for (topic_id = 0; topic_id < topic_number_; ++topic_id) { sum += distribution[topic_id]; if (sum > random_value) { break; } } return topic_id; }
// lda_gibbs_model.cc
#include <time.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>

#include "lda_gibbs_model.h"
#include "ldagibbs.h"
#include "utils.h"

LdaGibbsModel::LdaGibbsModel(DataSet *data_set, TopicId topic_number,
                             double alpha, double beta) {
  data_set_ = data_set;
  document_number_ = data_set_->DocumentNumber();
  term_number_ = data_set_->TermNumber();
  topic_number_ = topic_number;
  document_topic_sum_ = AllocArray<int>(document_number_);
  topic_term_sum_ = AllocArray<int>(topic_number_);
  topic_term_count_ = AllocMatrix<int>(static_cast<size_t>(topic_number_),
                                       static_cast<size_t>(term_number_));
  document_topic_count_ = AllocMatrix<int>(static_cast<size_t>(document_number_),
                                           static_cast<size_t>(topic_number_));
  document_term_topic_ = new TopicId *[document_number_];
  for (DocumentId document_id = 0; document_id < document_number_; ++document_id) {
    size_t document_length = data_set_->DocumentLength(document_id);
    document_term_topic_[document_id] = AllocArray<TopicId>(document_length);
  }
  alpha_ = alpha;
  beta_ = beta;
}

LdaGibbsModel::~LdaGibbsModel() {
  FreeArray(document_topic_sum_);
  FreeArray(topic_term_sum_);
  FreeMatrix(document_topic_count_, static_cast<size_t>(document_number_));
  FreeMatrix(topic_term_count_, static_cast<size_t>(topic_number_));
  for (DocumentId document_id = 0; document_id < document_number_; ++document_id) {
    FreeArray(document_term_topic_[document_id]);
  }
  delete[] document_term_topic_;  // the outer array must be freed too
}

void LdaGibbsModel::InitializeModel() {
  srand(static_cast<unsigned int>(time(NULL)));
  for (DocumentId document_id = 0; document_id < document_number_; ++document_id) {
    for (size_t term_index = 0; term_index < data_set_->DocumentLength(document_id);
         ++term_index) {
      // Assign every token a random initial topic and update all counters.
      TopicId topic_id = rand() % topic_number_;
      TermId term_id = data_set_->GetTermId(document_id, term_index);
      document_topic_count_[document_id][topic_id]++;
      document_topic_sum_[document_id]++;
      topic_term_count_[topic_id][term_id]++;
      topic_term_sum_[topic_id]++;
      document_term_topic_[document_id][term_index] = topic_id;
    }
  }
}

void LdaGibbsModel::TopicDistribution(DocumentId document_id, size_t term_index,
                                      double *distribution) const {
  TopicId term_topic_id = document_term_topic_[document_id][term_index];
  TermId term_id = data_set_->GetTermId(document_id, term_index);
  for (TopicId estimate_topic_id = 0; estimate_topic_id < topic_number_;
       ++estimate_topic_id) {
    // Exclude the current token's own assignment from the counts.
    int exclude = 0;
    if (estimate_topic_id == term_topic_id) {
      exclude = 1;
    }
    distribution[estimate_topic_id] =
        (topic_term_count_[estimate_topic_id][term_id] + beta_ - exclude) *
        (document_topic_count_[document_id][estimate_topic_id] + alpha_ - exclude) /
        (topic_term_sum_[estimate_topic_id] - exclude + term_number_ * beta_);
  }
}

double Random0to1() {
  const double kRandomMaxPlus1 = static_cast<double>(RAND_MAX) + 1;
  const double kBigRandomMaxPlus1 = kRandomMaxPlus1 * kRandomMaxPlus1;
  double big_rand = rand() * kRandomMaxPlus1 + rand();
  return big_rand / kBigRandomMaxPlus1;
}

TopicId LdaGibbsModel::SamplingTopicFromDistribution(double *distribution) const {
  double sum = 0;
  for (TopicId topic_id = 0; topic_id < topic_number_; ++topic_id) {
    sum += distribution[topic_id];
  }
  double random_value = Random0to1() * sum;
  sum = 0;
  TopicId topic_id = 0;
  for (topic_id = 0; topic_id < topic_number_; ++topic_id) {
    sum += distribution[topic_id];
    if (sum > random_value) {
      break;
    }
  }
  return topic_id;
}

void LdaGibbsModel::GibbsSamplingOne(DocumentId document_id, size_t term_index) {
  TermId term_id = data_set_->GetTermId(document_id, term_index);
  TopicId topic_id = document_term_topic_[document_id][term_index];
  double *distribution = new double[topic_number_];
  TopicDistribution(document_id, term_index, distribution);
  TopicId new_topic_id = SamplingTopicFromDistribution(distribution);
  delete[] distribution;
  if (new_topic_id != topic_id) {
    // Move the token's counts from the old topic to the newly sampled one.
    document_term_topic_[document_id][term_index] = new_topic_id;
    document_topic_count_[document_id][topic_id]--;
    topic_term_count_[topic_id][term_id]--;
    topic_term_sum_[topic_id]--;
    document_topic_count_[document_id][new_topic_id]++;
    topic_term_count_[new_topic_id][term_id]++;
    topic_term_sum_[new_topic_id]++;
  }
}

void LdaGibbsModel::GibbsSampling(int iteration) {
  for (int i = 0; i < iteration; ++i) {
    for (DocumentId document_id = 0; document_id < document_number_; ++document_id) {
      for (size_t term_index = 0; term_index < data_set_->DocumentLength(document_id);
           ++term_index) {
        GibbsSamplingOne(document_id, term_index);
      }
    }
  }
}
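Another side note of mine: TopicDistribution above implements the usual collapsed Gibbs sampling update for LDA training,

    p(z_i = k \mid z_{-i}, w) \propto (n_{d,k}^{-i} + \alpha) \cdot \frac{n_{k,w_i}^{-i} + \beta}{n_k^{-i} + V\beta}

where n_{d,k} is document_topic_count_, n_{k,w} is topic_term_count_, n_k is topic_term_sum_, V is term_number_, and the -i superscript (the exclude flag in the code) removes the current token's own assignment from the counts.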
// pylda.cc -- Python 3 extension module wrapping the classes above
#include <Python.h>
#include <vector>
#include <stdio.h>

#include "dataset.h"
#include "lda_gibbs_model.h"
#include "lda_gibbs_inferencer.h"
#include "utils.h"

// ----------------------------------- DataSet ---------------------------------------

typedef struct {
  PyObject_HEAD
  DataSet *data_set_;
} DataSetObject;

static int DataSet_init(DataSetObject *self, PyObject *args, PyObject *kwds) {
  self->data_set_ = new DataSet();
  return 0;
}

static PyObject *DataSet_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
  return type->tp_alloc(type, 0);
}

static void DataSet_dealloc(DataSetObject *self) {
  delete self->data_set_;
  self->data_set_ = NULL;
  PyObject_Del(self);
}

static PyObject *DataSet_append_document(DataSetObject *self, PyObject *args) {
  PyObject *term_list_object;
  char *document_title;
  char *term;
  if (!PyArg_ParseTuple(args, "sO", &document_title, &term_list_object))
    return NULL;
  if (!PyList_Check(term_list_object)) {
    PyErr_SetString(PyExc_TypeError, "expected a list of terms");
    return NULL;
  }
  DocumentId document_id = self->data_set_->AppendDocument(document_title);
  Py_ssize_t document_size = PyList_Size(term_list_object);
  for (Py_ssize_t i = 0; i < document_size; ++i) {
    PyObject *term_unicode = PyList_GetItem(term_list_object, i);  // borrowed
    if (term_unicode == NULL) return NULL;
    PyObject *term_utf8_bytes = PyUnicode_AsUTF8String(term_unicode);
    if (term_utf8_bytes == NULL) return NULL;
    term = PyBytes_AsString(term_utf8_bytes);
    if (term == NULL) {
      Py_DECREF(term_utf8_bytes);
      return NULL;
    }
    self->data_set_->AppendTerm(document_id, term);
    Py_DECREF(term_utf8_bytes);
  }
  Py_RETURN_NONE;
}

static PyObject *DataSet_document_number(DataSetObject *self, PyObject *args) {
  if (!PyArg_ParseTuple(args, "")) return NULL;
  DocumentId document_number = self->data_set_->DocumentNumber();
  return PyLong_FromLong(static_cast<long>(document_number));
}

static PyObject *DataSet_term_number(DataSetObject *self, PyObject *args) {
  if (!PyArg_ParseTuple(args, "")) return NULL;
  TermId term_number = self->data_set_->TermNumber();
  return PyLong_FromLong(static_cast<long>(term_number));
}

static PyObject *DataSet_document_length(DataSetObject *self, PyObject *args) {
  long long_document_id;
  if (!PyArg_ParseTuple(args, "l", &long_document_id)) return NULL;
  size_t document_length = self->data_set_->DocumentLength(
      static_cast<DocumentId>(long_document_id));
  return PyLong_FromLong(static_cast<long>(document_length));
}

static PyObject *DataSet_get_term_by_id(DataSetObject *self, PyObject *args) {
  long term_id_long;
  if (!PyArg_ParseTuple(args, "l", &term_id_long)) return NULL;
  return PyUnicode_FromString(
      self->data_set_->dictionary()->GetTermStr(static_cast<TermId>(term_id_long)));
}

static PyObject *DataSet_get_document_title(DataSetObject *self, PyObject *args) {
  long document_id_long = 0;
  if (!PyArg_ParseTuple(args, "l", &document_id_long)) return NULL;
  return PyUnicode_FromString(
      self->data_set_->DocumentTitle(static_cast<DocumentId>(document_id_long)));
}

static PyMethodDef DataSetType_methods[] = {
  { "appendDocument", (PyCFunction)DataSet_append_document, METH_VARARGS,
    "Append a document to data set." },
  { "documentNumber", (PyCFunction)DataSet_document_number, METH_VARARGS,
    "Get document number in data set." },
  { "documentTitle", (PyCFunction)DataSet_get_document_title, METH_VARARGS,
    "Get document title in data set." },
  { "termNumber", (PyCFunction)DataSet_term_number, METH_VARARGS,
    "Get total term number in data set." },
  { "documentLength", (PyCFunction)DataSet_document_length, METH_VARARGS,
    "Get the length of a document." },
  { "getTermById", (PyCFunction)DataSet_get_term_by_id, METH_VARARGS,
    "Get a term string by its id." },
  { NULL }  /* Sentinel */
};

static PyTypeObject DataSetType = {
  PyVarObject_HEAD_INIT(NULL, 0)
  "lda.DataSet",                /* tp_name */
  sizeof(DataSetObject),        /* tp_basicsize */
  0,                            /* tp_itemsize */
  (destructor)DataSet_dealloc,  /* tp_dealloc */
  0,                            /* tp_print */
  0,                            /* tp_getattr */
  0,                            /* tp_setattr */
  0,                            /* tp_reserved */
  0,                            /* tp_repr */
  0,                            /* tp_as_number */
  0,                            /* tp_as_sequence */
  0,                            /* tp_as_mapping */
  0,                            /* tp_hash */
  0,                            /* tp_call */
  0,                            /* tp_str */
  0,                            /* tp_getattro */
  0,                            /* tp_setattro */
  0,                            /* tp_as_buffer */
  Py_TPFLAGS_DEFAULT,           /* tp_flags */
  "DataSet object",             /* tp_doc */
  0,                            /* tp_traverse */
  0,                            /* tp_clear */
  0,                            /* tp_richcompare */
  0,                            /* tp_weaklistoffset */
  0,                            /* tp_iter */
  0,                            /* tp_iternext */
  DataSetType_methods,          /* tp_methods */
  0,                            /* tp_members */
  0,                            /* tp_getset */
  0,                            /* tp_base */
  0,                            /* tp_dict */
  0,                            /* tp_descr_get */
  0,                            /* tp_descr_set */
  0,                            /* tp_dictoffset */
  (initproc)DataSet_init,       /* tp_init */
  0,                            /* tp_alloc */
  DataSet_new,                  /* tp_new */
};

// ----------------------------------- LdaGibbsModel ----------------------------------

typedef struct {
  PyObject_HEAD
  LdaGibbsModel *lda_model_;
  PyObject *data_set_object_;
} LdaObject;

static int Lda_init(LdaObject *self, PyObject *args, PyObject *kwds) {
  PyObject *data_set_object;
  double alpha = 0;
  double beta = 0;
  long topic_number_long = 0;
  if (!PyArg_ParseTuple(args, "Oldd", &data_set_object, &topic_number_long,
                        &alpha, &beta))
    return -1;
  if (PyObject_IsInstance(data_set_object, (PyObject *)&DataSetType) != 1) {
    PyErr_SetString(PyExc_TypeError, "expected a DataSet object");
    return -1;
  }
  self->data_set_object_ = data_set_object;
  Py_INCREF(data_set_object);
  DataSet *data_set = ((DataSetObject *)data_set_object)->data_set_;
  self->lda_model_ = new LdaGibbsModel(data_set,
                                       static_cast<TopicId>(topic_number_long),
                                       alpha, beta);
  self->lda_model_->InitializeModel();
  return 0;
}

static PyObject *Lda_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
  return type->tp_alloc(type, 0);
}

static void Lda_dealloc(LdaObject *self) {
  delete self->lda_model_;
  Py_XDECREF(self->data_set_object_);
  PyObject_Del(self);
}

static PyObject *Lda_gibbs_sampling(LdaObject *self, PyObject *args) {
  long iteration_long = 0;
  if (!PyArg_ParseTuple(args, "l", &iteration_long)) return NULL;
  self->lda_model_->GibbsSampling(static_cast<int>(iteration_long));
  Py_RETURN_NONE;
}

static PyObject *Lda_calculate_phi(LdaObject *self, PyObject *args) {
  long topic_id_long = 0;
  long term_id_long = 0;
  if (!PyArg_ParseTuple(args, "ll", &topic_id_long, &term_id_long)) return NULL;
  TopicId topic_id = static_cast<TopicId>(topic_id_long);
  TermId term_id = static_cast<TermId>(term_id_long);
  double phi = self->lda_model_->CalculatePhi(topic_id, term_id);
  return PyFloat_FromDouble(phi);
}

static PyObject *Lda_calculate_theta(LdaObject *self, PyObject *args) {
  long document_id_long = 0;
  long topic_id_long = 0;
  if (!PyArg_ParseTuple(args, "ll", &document_id_long, &topic_id_long)) return NULL;
  DocumentId document_id = static_cast<DocumentId>(document_id_long);
  TopicId topic_id = static_cast<TopicId>(topic_id_long);
  double theta = self->lda_model_->CalculateTheta(document_id, topic_id);
  return PyFloat_FromDouble(theta);
}

static PyMethodDef LdaType_methods[] = {
  { "gibbsSampling", (PyCFunction)Lda_gibbs_sampling, METH_VARARGS,
    "Start Gibbs sampling." },
  { "calculatePhi", (PyCFunction)Lda_calculate_phi, METH_VARARGS,
    "Get the value of phi." },
  { "calculateTheta", (PyCFunction)Lda_calculate_theta, METH_VARARGS,
    "Get the value of theta." },
  { NULL }  /* Sentinel */
};

static PyTypeObject LdaType = {
  PyVarObject_HEAD_INIT(NULL, 0)
  "lda.Lda",                    /* tp_name */
  sizeof(LdaObject),            /* tp_basicsize */
  0,                            /* tp_itemsize */
  (destructor)Lda_dealloc,      /* tp_dealloc */
  0,                            /* tp_print */
  0,                            /* tp_getattr */
  0,                            /* tp_setattr */
  0,                            /* tp_reserved */
  0,                            /* tp_repr */
  0,                            /* tp_as_number */
  0,                            /* tp_as_sequence */
  0,                            /* tp_as_mapping */
  0,                            /* tp_hash */
  0,                            /* tp_call */
  0,                            /* tp_str */
  0,                            /* tp_getattro */
  0,                            /* tp_setattro */
  0,                            /* tp_as_buffer */
  Py_TPFLAGS_DEFAULT,           /* tp_flags */
  "Lda object",                 /* tp_doc */
  0,                            /* tp_traverse */
  0,                            /* tp_clear */
  0,                            /* tp_richcompare */
  0,                            /* tp_weaklistoffset */
  0,                            /* tp_iter */
  0,                            /* tp_iternext */
  LdaType_methods,              /* tp_methods */
  0,                            /* tp_members */
  0,                            /* tp_getset */
  0,                            /* tp_base */
  0,                            /* tp_dict */
  0,                            /* tp_descr_get */
  0,                            /* tp_descr_set */
  0,                            /* tp_dictoffset */
  (initproc)Lda_init,           /* tp_init */
  0,                            /* tp_alloc */
  Lda_new,                      /* tp_new */
};

// ------------------------------------- LdaGibbsInferencer ---------------------------------

typedef struct {
  PyObject_HEAD
  LdaGibbsInferencer *lda_inferencer_;
  Dictionary *dictionary_;
} LdaInferencerObject;

static int LdaInferencer_init(LdaInferencerObject *self, PyObject *args, PyObject *kwds) {
  double alpha = 0;
  double beta = 0;
  long topic_number_long = 0;
  PyObject *dictionary_object = NULL;
  PyObject *iterator = NULL;
  PyObject *item = NULL;
  self->dictionary_ = NULL;
  self->lda_inferencer_ = NULL;
  if (!PyArg_ParseTuple(args, "Oldd", &dictionary_object, &topic_number_long,
                        &alpha, &beta))
    goto failed_cleanup;
  self->dictionary_ = new Dictionary();
  iterator = PyObject_GetIter(dictionary_object);
  if (iterator == NULL) goto failed_cleanup;
  while ((item = PyIter_Next(iterator)) != NULL) {
    PyObject *term_utf8_bytes = PyUnicode_AsUTF8String(item);
    if (term_utf8_bytes == NULL) goto failed_cleanup;
    const char *term = PyBytes_AsString(term_utf8_bytes);
    if (term == NULL) {
      Py_DECREF(term_utf8_bytes);
      goto failed_cleanup;
    }
    // Insert the term into the inferencer's dictionary.
    self->dictionary_->GetTermId(term);
    Py_DECREF(term_utf8_bytes);
    Py_DECREF(item);
    item = NULL;
  }
  Py_DECREF(iterator);
  self->lda_inferencer_ = new LdaGibbsInferencer(
      static_cast<TopicId>(topic_number_long), self->dictionary_, alpha, beta);
  return 0;

failed_cleanup:
  Py_XDECREF(item);
  Py_XDECREF(iterator);
  delete self->dictionary_;
  self->dictionary_ = NULL;
  delete self->lda_inferencer_;
  self->lda_inferencer_ = NULL;
  return -1;
}

static PyObject *LdaInferencer_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
  return type->tp_alloc(type, 0);
}

static void LdaInferencer_dealloc(LdaInferencerObject *self) {
  delete self->lda_inferencer_;
  delete self->dictionary_;
  PyObject_Del(self);
}

// Wires the existing C++ UpdatePhi through to Python so a trained phi matrix
// can be loaded into the inferencer.
static PyObject *LdaInferencer_update_phi(LdaInferencerObject *self, PyObject *args) {
  long topic_id_long = 0;
  long term_id_long = 0;
  double value = 0;
  if (!PyArg_ParseTuple(args, "lld", &topic_id_long, &term_id_long, &value)) return NULL;
  self->lda_inferencer_->UpdatePhi(static_cast<TopicId>(topic_id_long),
                                   static_cast<TermId>(term_id_long), value);
  Py_RETURN_NONE;
}

static PyObject *LdaInferencer_inference(LdaInferencerObject *self, PyObject *args) {
  long iteration_long = 0;
  PyObject *term_list_object = NULL;
  TermId *document_content = NULL;
  double *topic_distribution = NULL;
  PyObject *result = NULL;
  if (!PyArg_ParseTuple(args, "Ol", &term_list_object, &iteration_long)) return NULL;
  if (!PyList_Check(term_list_object)) {
    PyErr_SetString(PyExc_TypeError, "expected a list of terms");
    return NULL;
  }
  // Get the document's term strings from the parameter.
  Py_ssize_t document_length = PyList_Size(term_list_object);
  document_content = new TermId[document_length];
  for (Py_ssize_t term_index = 0; term_index < document_length; ++term_index) {
    PyObject *term_str_object = PyList_GetItem(term_list_object, term_index);  // borrowed
    PyObject *term_utf8_bytes = PyUnicode_AsUTF8String(term_str_object);
    if (term_utf8_bytes == NULL) goto inference_failed_cleanup;
    const char *term_str = PyBytes_AsString(term_utf8_bytes);
    if (term_str == NULL) {
      Py_DECREF(term_utf8_bytes);
      goto inference_failed_cleanup;
    }
    // Terms unseen at training time map to Dictionary::kNotExist.
    document_content[term_index] = self->dictionary_->FindIdByTerm(term_str);
    Py_DECREF(term_utf8_bytes);
  }
  topic_distribution = new double[self->lda_inferencer_->topic_number()];
  // Start inference and return the topic distribution as a list of floats.
  self->lda_inferencer_->Inference(static_cast<int>(iteration_long),
                                   static_cast<size_t>(document_length),
                                   document_content, topic_distribution);
  result = PyList_New(self->lda_inferencer_->topic_number());
  for (TopicId topic_id = 0; topic_id < self->lda_inferencer_->topic_number();
       ++topic_id) {
    PyList_SetItem(result, topic_id, PyFloat_FromDouble(topic_distribution[topic_id]));
  }
  delete[] document_content;
  delete[] topic_distribution;
  return result;

inference_failed_cleanup:
  delete[] document_content;
  delete[] topic_distribution;
  return NULL;
}

static PyMethodDef LdaInferencerType_methods[] = {
  { "updatePhi", (PyCFunction)LdaInferencer_update_phi, METH_VARARGS,
    "Set one entry of the phi matrix." },
  { "inference", (PyCFunction)LdaInferencer_inference, METH_VARARGS,
    "Infer the topic distribution of a new document." },
  { NULL }  /* Sentinel */
};

static PyTypeObject LdaInferencerType = {
  PyVarObject_HEAD_INIT(NULL, 0)
  "lda.LdaInferencer",                /* tp_name */
  sizeof(LdaInferencerObject),        /* tp_basicsize */
  0,                                  /* tp_itemsize */
  (destructor)LdaInferencer_dealloc,  /* tp_dealloc */
  0,                                  /* tp_print */
  0,                                  /* tp_getattr */
  0,                                  /* tp_setattr */
  0,                                  /* tp_reserved */
  0,                                  /* tp_repr */
  0,                                  /* tp_as_number */
  0,                                  /* tp_as_sequence */
  0,                                  /* tp_as_mapping */
  0,                                  /* tp_hash */
  0,                                  /* tp_call */
  0,                                  /* tp_str */
  0,                                  /* tp_getattro */
  0,                                  /* tp_setattro */
  0,                                  /* tp_as_buffer */
  Py_TPFLAGS_DEFAULT,                 /* tp_flags */
  "LdaInferencer object",             /* tp_doc */
  0,                                  /* tp_traverse */
  0,                                  /* tp_clear */
  0,                                  /* tp_richcompare */
  0,                                  /* tp_weaklistoffset */
  0,                                  /* tp_iter */
  0,                                  /* tp_iternext */
  LdaInferencerType_methods,          /* tp_methods */
  0,                                  /* tp_members */
  0,                                  /* tp_getset */
  0,                                  /* tp_base */
  0,                                  /* tp_dict */
  0,                                  /* tp_descr_get */
  0,                                  /* tp_descr_set */
  0,                                  /* tp_dictoffset */
  (initproc)LdaInferencer_init,       /* tp_init */
  0,                                  /* tp_alloc */
  LdaInferencer_new,                  /* tp_new */
};

// ------------------------------------- Python Module --------------------------------------

static PyModuleDef PyLda_module = {
  PyModuleDef_HEAD_INIT,
  "lda",   /* name of module */
  NULL,    /* module documentation, may be NULL */
  -1,      /* size of per-interpreter state of the module, or -1 if
              the module keeps state in global variables. */
  NULL, NULL, NULL, NULL, NULL
};

PyMODINIT_FUNC PyInit_lda(void) {
  PyObject *m;
  if (PyType_Ready(&DataSetType) < 0) return NULL;
  if (PyType_Ready(&LdaType) < 0) return NULL;
  if (PyType_Ready(&LdaInferencerType) < 0) return NULL;
  m = PyModule_Create(&PyLda_module);
  if (m == NULL) return NULL;
  Py_INCREF(&DataSetType);
  PyModule_AddObject(m, "DataSet", (PyObject *)&DataSetType);
  Py_INCREF(&LdaType);
  PyModule_AddObject(m, "Lda", (PyObject *)&LdaType);
  Py_INCREF(&LdaInferencerType);
  PyModule_AddObject(m, "LdaInferencer", (PyObject *)&LdaInferencerType);
  return m;
}
# setup.py
from distutils.core import setup, Extension

module1 = Extension('lda',
                    sources = ['pylda.cc', 'dataset.cc', 'dictionary.cc',
                               'lda_gibbs_model.cc', 'lda_gibbs_inferencer.cc'])

setup(name = 'PackageName',
      version = '1.0',
      description = 'This is a demo package',
      ext_modules = [module1])
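One note from me rather than from Xiaoyang: pylda.cc references LdaGibbsInferencer, so the inferencer's source file has to be compiled in as well; I added lda_gibbs_inferencer.cc to sources above, assuming that is its file name (it follows the lda_gibbs_model.cc pattern). With all the sources in one directory, a typical build is

    python3 setup.py build_ext --inplace

which drops the compiled lda extension module next to the test script below.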
# Test script: train LDA on a pre-segmented corpus and dump phi / theta.
import lda

dataset = lda.DataSet()

stopwords = None
with open('stopwords.txt', encoding = 'utf-8') as fp:
    stopwords = set(map(lambda x: x.strip(), fp))

print("Loading data ...")
for line in open('f_sp.txt', encoding = 'utf-8'):
    line = line.strip()
    terms = line.split()
    terms = list(filter(lambda x: len(x) > 1 and x not in stopwords, terms))
    if len(terms) == 0:
        continue
    # The first term of each line doubles as the document title.
    dataset.appendDocument(terms[0], terms)

print("Total documents: {0}\nTotal terms: {1}".format(
    dataset.documentNumber(), dataset.termNumber()))

topic_number = 100
term_number = dataset.termNumber()
document_number = dataset.documentNumber()
# Renamed from `lda` to `model` so the variable does not shadow the module.
model = lda.Lda(dataset, topic_number, 50 / topic_number, 0.01)

print("Start sampling")
iteration = 0
for i in range(10):
    for j in range(10):
        iteration += 1
        print("Iteration {0}".format(iteration))
        model.gibbsSampling(1)
    # After every 10 iterations, print the top 20 terms of each topic.
    for topic_id in range(topic_number):
        phi_topic = map(lambda term_id: model.calculatePhi(topic_id, term_id),
                        range(term_number))
        term_weight_pairs = list(zip(range(term_number), phi_topic))
        term_weight_pairs.sort(key = lambda x: x[1], reverse = True)
        print("Topic {0}: {1}".format(topic_id, ', '.join(
            map(lambda x: dataset.getTermById(x[0]), term_weight_pairs[:20]))))

with open('phi.matrix', 'w', encoding = 'utf-8') as fp:
    for term_id in range(term_number):
        phi_term = map(lambda topic_id: '{:.15f}'.format(
            model.calculatePhi(topic_id, term_id)), range(topic_number))
        term_str = dataset.getTermById(term_id)
        fp.write('{0} {1}\n'.format(term_str, ' '.join(phi_term)))

with open('theta.matrix', 'w', encoding = 'utf-8') as fp:
    for document_id in range(document_number):
        theta_document = map(lambda topic_id: '{:.15f}'.format(
            model.calculateTheta(document_id, topic_id)), range(topic_number))
        document_title = dataset.documentTitle(document_id)
        fp.write('{0} {1}\n'.format(document_title, ' '.join(theta_document)))

print('OK')
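Since the point of all this is text clustering, here is a small sketch of mine (not part of Xiaoyang's original code) showing one way to use the theta.matrix produced above: group documents by their most probable topic. It only relies on the file format written by the test script (title followed by one theta value per topic).

# Hypothetical follow-up script, not in the original post.
from collections import defaultdict

clusters = defaultdict(list)
with open('theta.matrix', encoding = 'utf-8') as fp:
    for line in fp:
        fields = line.split()
        title = fields[0]
        theta = list(map(float, fields[1:]))
        # Assign each document to its highest-probability topic.
        best_topic = max(range(len(theta)), key = lambda k: theta[k])
        clusters[best_topic].append(title)

for topic_id in sorted(clusters):
    print('Cluster {0} ({1} documents): {2}'.format(
        topic_id, len(clusters[topic_id]), ', '.join(clusters[topic_id][:10])))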