隐马可夫(HMM)中文分词词性标注程序
本程序中的隐马可夫(HMM)概率模型由 PFR 人民日报标注语料库(199801)生成。
/**
 * Hidden Markov Model (HMM) part-of-speech tagger.
 *
 * <p>When the class is loaded, the static initializer fills three probability
 * tables from the classpath resources {@code startprob.txt},
 * {@code tranprob.txt} and {@code emissionprob.txt}.
 * {@link #tagging(String[])} then runs Viterbi decoding over those tables to
 * assign one state (tag) to every observation (word).
 */
public class HMM
{
    /** Number of hidden states; state ids run from 0 to STATE_COUNT - 1. */
    private static final int STATE_COUNT = 52;

    /** State names, indexed by state id as returned by {@code CountPOS.getPOSFromId}. */
    static final String[] states = new String[STATE_COUNT];

    /** state -> probability of starting a sequence in that state. */
    static final HashMap<String, Double> start_probability = new HashMap<String, Double>();

    /** previous state -> (next state -> transition probability). */
    static final HashMap<String, HashMap<String, Double>> transition_probability = new HashMap<String, HashMap<String, Double>>();

    /** state -> (observation -> emission probability). */
    static final HashMap<String, HashMap<String, Double>> emission_probability = new HashMap<String, HashMap<String, Double>>();

    static
    {
        for (int id = 0; id < STATE_COUNT; id++)
            states[id] = CountPOS.getPOSFromId(id);

        // NOTE(review): resources are resolved through Viterbi's class loader, not HMM's — confirm this is intended.
        // NOTE(review): whether FileUtil.readFileByLine closes the stream is not visible here — verify.
        ClassLoader loader = Viterbi.class.getClassLoader();

        // startprob.txt: one probability per line; line k belongs to states[k].
        InputStream is = loader.getResourceAsStream("startprob.txt");
        FileUtil.readFileByLine(is, "UTF-8", new Callback(){
            int row = 0;
            public void execute(String line) {
                start_probability.put(states[row], Double.parseDouble(line));
                row++;
            }
        });

        // tranprob.txt: line k holds the tab-separated probabilities of moving
        // from states[k] to states[0], states[1], ...
        is = loader.getResourceAsStream("tranprob.txt");
        FileUtil.readFileByLine(is, "UTF-8", new Callback(){
            int row = 0;
            public void execute(String line) {
                HashMap<String, Double> outgoing = new HashMap<String, Double>();
                String[] cells = line.split("\t");
                for (int col = 0; col < cells.length; col++)
                    outgoing.put(states[col], Double.parseDouble(cells[col]));
                transition_probability.put(states[row], outgoing);
                row++;
            }
        });

        // emissionprob.txt: "<observation>\t<state>:<prob> <state>:<prob> ..." per line.
        is = loader.getResourceAsStream("emissionprob.txt");
        FileUtil.readFileByLine(is, "UTF-8", new Callback(){
            public void execute(String line) {
                String[] parts = line.split("\t");
                String[] entries = parts[1].split(" ");
                for (String entry : entries)
                {
                    String[] pair = entry.split(":");
                    HashMap<String, Double> perState = emission_probability.get(pair[0]);
                    if (perState == null)
                        perState = new HashMap<String, Double>();
                    perState.put(parts[0], Double.parseDouble(pair[1]));
                    emission_probability.put(pair[0], perState);
                }
            }
        });
    }

    /**
     * Tags a sequence of observations using the tables loaded by this class.
     *
     * @param observations the observation sequence (one word per element)
     * @return one state name per observation, in the same order
     */
    public static String[] tagging(String[] observations)
    {
        return forward_viterbi(observations, states, start_probability, transition_probability, emission_probability);
    }

    /**
     * Viterbi decoding: finds the state sequence with the highest joint
     * probability for the given observations.
     *
     * <p>Probabilities are multiplied directly (no log space). A missing
     * start, transition or emission entry is treated as probability 0. If no
     * state emits some observation, every score from that time step on is 0
     * and the back-pointers fall back to state index 0.
     *
     * @param observations           observation sequence; {@code null} or empty yields an empty result
     * @param states                 all state names; indices into this array identify states internally
     * @param start_probability      state -> initial probability
     * @param transition_probability previous state -> (next state -> probability)
     * @param emission_probability   state -> (observation -> probability)
     * @return the decoded state name for each observation
     */
    public static String[] forward_viterbi(String[] observations, String[] states, HashMap<String, Double> start_probability, HashMap<String, HashMap<String, Double>> transition_probability, HashMap<String, HashMap<String, Double>> emission_probability)
    {
        // Nothing to decode; previously this case failed with an index error.
        if (observations == null || observations.length == 0)
            return new String[0];

        final int steps = observations.length;
        final int stateCount = states.length;
        // path[t][i]: best predecessor of state i at time t (row 0 stays all zeros).
        int[][] path = new int[steps][stateCount];
        // r[t][i]: probability of the best path that ends in state i at time t.
        double[][] r = new double[steps][stateCount];

        for (int j = 0; j < stateCount; j++)
        {
            Double start = start_probability.get(states[j]);
            Double emit = lookup(emission_probability, states[j], observations[0]);
            if (start != null && emit != null)
                r[0][j] = start.doubleValue() * emit.doubleValue();
        }

        for (int t = 1; t < steps; t++)
        {
            for (int i = 0; i < stateCount; i++)
            {
                double best = 0;
                int bestPrev = 0;
                // The emission term depends only on (i, t), so look it up once
                // instead of once per predecessor.
                Double emit = lookup(emission_probability, states[i], observations[t]);
                if (emit != null)
                {
                    for (int j = 0; j < stateCount; j++)
                    {
                        Double trans = lookup(transition_probability, states[j], states[i]);
                        if (trans == null)
                            continue; // missing transition counts as probability 0
                        double score = r[t - 1][j] * trans.doubleValue() * emit.doubleValue();
                        if (score > best)
                        {
                            best = score;
                            bestPrev = j;
                        }
                    }
                }
                r[t][i] = best;
                path[t][i] = bestPrev;
            }
        }

        // Pick the most probable final state (index 0 when every score is 0).
        double bestFinal = 0;
        int current = 0;
        for (int i = 0; i < stateCount; i++)
        {
            if (r[steps - 1][i] > bestFinal)
            {
                bestFinal = r[steps - 1][i];
                current = i;
            }
        }

        // Follow the back-pointers from the last time step to the first.
        int[] trace = new int[steps];
        trace[steps - 1] = current;
        for (int t = steps - 1; t > 0; t--)
        {
            current = path[t][current];
            trace[t - 1] = current;
        }

        String[] ret = new String[steps];
        for (int t = 0; t < steps; t++)
            ret[t] = states[trace[t]];
        return ret;
    }

    /** Returns table[row][col], or null when either level has no entry. */
    private static Double lookup(HashMap<String, HashMap<String, Double>> table, String row, String col)
    {
        HashMap<String, Double> inner = table.get(row);
        return inner == null ? null : inner.get(col);
    }

    /**
     * Demo entry point: tags a fixed sample sentence and prints each tag
     * followed by a comma.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        // Alternative sample: {"这些","服务","实体","改","由","当地","有关","部门","管理"}
        String[] observations = new String[] {"研究","生命","的","起源"};
        String[] ret = tagging(observations);
        for (String tag : ret)
            System.out.print(tag + ",");
    }
}
{
/** State names, indexed by state id as returned by CountPOS.getPOSFromId. */
static final String[] states = new String[52];
/** state -> probability of starting a sequence in that state. */
static final HashMap<String, Double> start_probability = new HashMap<String, Double>();
/** previous state -> (next state -> transition probability). */
static final HashMap<String, HashMap<String, Double>> transition_probability = new HashMap<String, HashMap<String, Double>>();
/** state -> (observation -> emission probability). */
static final HashMap<String, HashMap<String, Double>> emission_probability = new HashMap<String, HashMap<String, Double>>();

// Loads the state names and the three probability tables once, at class-load time.
static
{
    for (int id = 0; id < states.length; id++)
    {
        states[id] = CountPOS.getPOSFromId(id);
    }

    // startprob.txt: one probability per line; line k belongs to states[k].
    InputStream startStream = Viterbi.class.getClassLoader().getResourceAsStream("startprob.txt");
    FileUtil.readFileByLine(startStream, "UTF-8", new Callback(){
        int row = 0;
        public void execute(String line) {
            double probability = Double.parseDouble(line);
            start_probability.put(states[row], probability);
            row += 1;
        }
    });

    // tranprob.txt: line k holds the tab-separated probabilities of moving
    // from states[k] to states[0], states[1], ...
    InputStream transitionStream = Viterbi.class.getClassLoader().getResourceAsStream("tranprob.txt");
    FileUtil.readFileByLine(transitionStream, "UTF-8", new Callback(){
        int row = 0;
        public void execute(String line) {
            HashMap<String, Double> outgoing = new HashMap<String, Double>();
            String[] cells = line.split("\t");
            int col = 0;
            while (col < cells.length)
            {
                outgoing.put(states[col], Double.parseDouble(cells[col]));
                col++;
            }
            transition_probability.put(states[row], outgoing);
            row += 1;
        }
    });

    // emissionprob.txt: "<observation>\t<state>:<prob> <state>:<prob> ..." per line.
    InputStream emissionStream = Viterbi.class.getClassLoader().getResourceAsStream("emissionprob.txt");
    FileUtil.readFileByLine(emissionStream, "UTF-8", new Callback(){
        public void execute(String line) {
            String[] parts = line.split("\t");
            String[] entries = parts[1].split(" ");
            for (int k = 0; k < entries.length; k++)
            {
                String[] pair = entries[k].split(":");
                // Reuse the per-state table if one exists, otherwise start a new one.
                HashMap<String, Double> perState = emission_probability.containsKey(pair[0])
                        ? emission_probability.get(pair[0])
                        : new HashMap<String, Double>();
                perState.put(parts[0], Double.parseDouble(pair[1]));
                emission_probability.put(pair[0], perState);
            }
        }
    });
}
/**
 * Tags a sequence of observations using the probability tables held in this
 * class's static fields.
 *
 * @param observations the observation sequence (one word per element)
 * @return whatever forward_viterbi returns for these observations and tables
 */
public static String[] tagging(String[] observations)
{
    String[] tags = forward_viterbi(
            observations,
            states,
            start_probability,
            transition_probability,
            emission_probability);
    return tags;
}
/**
 * Viterbi decoding: finds the state sequence with the highest joint
 * probability for the given observations.
 *
 * <p>Probabilities are multiplied directly (no log space). A missing start,
 * transition or emission entry is treated as probability 0. If no state emits
 * some observation, every score from that time step on is 0 and the
 * back-pointers fall back to state index 0.
 *
 * @param observations           observation sequence; {@code null} or empty yields an empty result
 * @param states                 all state names; indices into this array identify states internally
 * @param start_probability      state -> initial probability
 * @param transition_probability previous state -> (next state -> probability)
 * @param emission_probability   state -> (observation -> probability)
 * @return the decoded state name for each observation
 */
public static String[] forward_viterbi(String[] observations, String[] states, HashMap<String, Double> start_probability, HashMap<String, HashMap<String, Double>> transition_probability, HashMap<String, HashMap<String, Double>> emission_probability)
{
    // Nothing to decode; previously this case failed with an index error.
    if (observations == null || observations.length == 0)
        return new String[0];

    final int steps = observations.length;
    final int stateCount = states.length;
    // path[t][i]: best predecessor of state i at time t (row 0 stays all zeros).
    int[][] path = new int[steps][stateCount];
    // r[t][i]: probability of the best path that ends in state i at time t.
    double[][] r = new double[steps][stateCount];

    for (int j = 0; j < stateCount; j++)
    {
        Double start = start_probability.get(states[j]);
        Double emit = viterbiLookup(emission_probability, states[j], observations[0]);
        if (start != null && emit != null)
            r[0][j] = start.doubleValue() * emit.doubleValue();
    }

    for (int t = 1; t < steps; t++)
    {
        for (int i = 0; i < stateCount; i++)
        {
            double best = 0;
            int bestPrev = 0;
            // The emission term depends only on (i, t), so look it up once
            // instead of once per predecessor.
            Double emit = viterbiLookup(emission_probability, states[i], observations[t]);
            if (emit != null)
            {
                for (int j = 0; j < stateCount; j++)
                {
                    Double trans = viterbiLookup(transition_probability, states[j], states[i]);
                    if (trans == null)
                        continue; // missing transition counts as probability 0
                    double score = r[t - 1][j] * trans.doubleValue() * emit.doubleValue();
                    if (score > best)
                    {
                        best = score;
                        bestPrev = j;
                    }
                }
            }
            r[t][i] = best;
            path[t][i] = bestPrev;
        }
    }

    // Pick the most probable final state (index 0 when every score is 0).
    double bestFinal = 0;
    int current = 0;
    for (int i = 0; i < stateCount; i++)
    {
        if (r[steps - 1][i] > bestFinal)
        {
            bestFinal = r[steps - 1][i];
            current = i;
        }
    }

    // Follow the back-pointers from the last time step to the first.
    int[] trace = new int[steps];
    trace[steps - 1] = current;
    for (int t = steps - 1; t > 0; t--)
    {
        current = path[t][current];
        trace[t - 1] = current;
    }

    String[] ret = new String[steps];
    for (int t = 0; t < steps; t++)
        ret[t] = states[trace[t]];
    return ret;
}

/** Returns table[row][col], or null when either level has no entry. */
private static Double viterbiLookup(HashMap<String, HashMap<String, Double>> table, String row, String col)
{
    HashMap<String, Double> inner = table.get(row);
    return inner == null ? null : inner.get(col);
}
/**
 * Demo entry point: tags a fixed sample sentence and prints each resulting
 * tag followed by a comma on standard output.
 *
 * @param args ignored
 */
public static void main(String[] args)
{
    // Alternative sample: {"这些","服务","实体","改","由","当地","有关","部门","管理"}
    String[] sample = {"研究","生命","的","起源"};
    String[] tags = tagging(sample);
    for (int k = 0; k < tags.length; k++)
    {
        System.out.print(tags[k] + ",");
    }
}
}