java大作业 KShinglingAlgorithm

wiki上关于KShingling Algorithm(w-shingling)的说明:

http://en.wikipedia.org/wiki/W-shingling

摘要:

In natural language processing a w-shingling is a set of unique "shingles"—contiguous subsequences of tokens in a document—that can be used to gauge the similarity of two documents. The w denotes the number of tokens in each shingle in the set.

The document, "a rose is a rose is a rose" can be tokenized as follows:

(a,rose,is,a,rose,is,a,rose)

The set of all contiguous sequences of 4 tokens (N-grams, here: 4-grams) is

{ (a,rose,is,a), (rose,is,a,rose), (is,a,rose,is), (a,rose,is,a), (rose,is,a,rose) } = { (a,rose,is,a), (rose,is,a,rose), (is,a,rose,is) }

我对此算法的理解是:把每段文本都按上述方式分解成 shingle 集合后,统计两个集合的并集大小 b 和交集大小 a,用 a/b(即 Jaccard 相似系数)作为相似度。

写得有些复杂:

  1 package bigproject2;

  2 

  import java.awt.GraphicsEnvironment;
  import java.util.Arrays;
  import java.util.HashSet;
  import java.util.LinkedHashSet;
  import java.util.Set;

  import javax.swing.JOptionPane;

  4 

  /**
   * Set helpers for the k-shingling similarity measure: tokenising a text,
   * building its set of k-token shingles, and counting the union and
   * intersection of two shingle sets.
   */
  public class union {

      /**
       * Splits a text into word tokens.
       *
       * <p>Tokens are separated by any run of whitespace (spaces, tabs, line
       * breaks); leading and trailing whitespace is ignored, so no empty
       * tokens are ever produced.
       *
       * @param str the text to tokenise; {@code null} is treated as empty
       * @return the tokens in order of appearance, or an empty array when the
       *         text contains no tokens
       */
      public String[] ziji(String str)
      {
          if(str==null)
              return new String[0];
          String trimmed=str.trim();
          if(trimmed.isEmpty())
              return new String[0];
          return trimmed.split("\\s+");
      }

      /**
       * Builds the set of distinct k-token shingles of a token sequence.
       *
       * <p>Each shingle is the k consecutive tokens joined by a single space,
       * so that different token sequences such as ("ab","c") and ("a","bc")
       * do not collapse into the same string.
       *
       * @param str the token sequence
       * @param k   number of tokens per shingle; must be at least 1
       * @return the distinct shingles in order of first appearance
       * @throws MyException              if there are fewer than {@code k} tokens
       * @throws IllegalArgumentException if {@code k} is smaller than 1
       */
      public String[] cut(String[] str,int k) throws MyException{
          if(k<1)
              throw new IllegalArgumentException("k must be at least 1, got "+k);
          if(str.length<k)
              throw new MyException("单词数少于"+k+",无法进行计算!");
          // A LinkedHashSet removes duplicates while keeping first-seen order.
          Set<String> shingles=new LinkedHashSet<String>();
          for(int start=0;start+k<=str.length;start++)
          {
              StringBuilder shingle=new StringBuilder();
              for(int i=start;i<start+k;i++)
              {
                  if(i>start)
                      shingle.append(' ');
                  shingle.append(str[i]);
              }
              shingles.add(shingle.toString());
          }
          return shingles.toArray(new String[0]);
      }

      /**
       * Signals that a text has too few tokens to form a single shingle.
       *
       * <p>Constructing the exception also shows the message in a warning
       * dialog, unless the JVM is running headless.
       */
      public static class MyException extends Exception{
          /**
           * @param str the message, stored as the exception message and shown
           *            to the user
           */
          public MyException(String str){
              super(str);
              // A dialog cannot be opened without a display; skip it there.
              if(!GraphicsEnvironment.isHeadless())
                  JOptionPane.showMessageDialog(null, str,"警告", JOptionPane.INFORMATION_MESSAGE);
          }
      }

      /**
       * Counts the elements of the union of two string arrays.
       * Duplicates inside either array are counted once.
       *
       * @param a first array
       * @param b second array
       * @return number of distinct strings occurring in {@code a} or {@code b}
       */
      public int heji(String[] a,String[] b){
          Set<String> all=new HashSet<String>(Arrays.asList(a));
          all.addAll(Arrays.asList(b));
          return all.size();
      }

      /**
       * Counts the elements of the intersection of two string arrays.
       * Duplicates inside either array are counted once.
       *
       * @param a first array
       * @param b second array
       * @return number of distinct strings occurring in both {@code a} and {@code b}
       */
      public int jiaoji(String[] a,String[] b){
          Set<String> common=new HashSet<String>(Arrays.asList(a));
          common.retainAll(new HashSet<String>(Arrays.asList(b)));
          return common.size();
      }

  }

 

 1 package bigproject2;

 2 

 3 

 4 public class KShinglingAlgorithm extends union{

 5     private String text1,text2;

 6     public String getText1()

 7     {

 8         return text1;

 9     }

10     public String getText2()

11     {

12         return text2;

13     }

14     public void setText1(String text1)

15     {

16         this.text1=text1;

17     }

18     public void setText2(String text2)

19     {

20         this.text2=text2;

21     }

22     

23     public float getSimilarity(int k)

24     {

25        union a=new union();

26        String[] t1=a.ziji(this.text1);

27        String[] t2=a.ziji(this.text2);

28        String[] t1t,t2t;

29        try{

30            t1t=a.cut(t1, k);

31            t2t=a.cut(t2, k);

32            

33        }catch(MyException e){

34                return -1;

35        }

36        int he=a.heji(t1t, t2t);

37        int jiao=a.jiaoji(t1t, t2t);

38        return (float)jiao/he;

39     }

40 

41 }

 

 

面板设计部分:

  1 package bigproject2;

  2 import java.awt.*;

  3 import java.awt.event.*;

  4 import java.io.BufferedReader;

  5 import java.io.File;

  6 import java.io.FileNotFoundException;

  7 import java.io.FileReader;

  8 import java.io.IOException;

  9 import java.io.InputStreamReader;

 10 

 11 import javax.swing.*;

 12 import javax.swing.event.*;

 13 import javax.swing.filechooser.FileNameExtensionFilter;

 14 

 15 public class Outlook extends JFrame{

 16     JFrame frm=new JFrame("相似度计算器");

 17     JPanel areabottom=new JPanel();

 18     JPanel areatop=new JPanel();

 19     JPanel areamiddle=new JPanel();

 20     static JTextArea tl=new JTextArea();

 21     static JTextArea tr=new JTextArea();

 22     JScrollPane left=new JScrollPane(tl,JScrollPane.VERTICAL_SCROLLBAR_ALWAYS,

 23             JScrollPane.HORIZONTAL_SCROLLBAR_AS_NEEDED);

 24     JScrollPane right=new JScrollPane(tr,JScrollPane.VERTICAL_SCROLLBAR_ALWAYS,

 25             JScrollPane.HORIZONTAL_SCROLLBAR_AS_NEEDED);

 26     JSplitPane sp=new JSplitPane(JSplitPane.HORIZONTAL_SPLIT,left,right);

 27     static JButton toBig=new JButton("全部大写");

 28     static JButton delbd=new JButton("去掉标点");

 29     static JButton count=new JButton("计算相似度");

 30     JLabel space=new JLabel("                                               ");

 31     JLabel t1=new JLabel("Text1");

 32     JLabel t2=new JLabel("Text2");

 33 

 34     JMenuBar mb=new JMenuBar();

 35     JMenu open=new JMenu("打开");

 36     JMenuItem opent1=new JMenuItem("打开到Text1");

 37     JMenuItem opent2=new JMenuItem("打开到Text2");

 38     

 39     private String str="";

 40     public Outlook()

 41     {

 42         judge();

 43         

 44         frm.setVisible(true);

 45         frm.setBounds(50, 50, 500, 400);

 46         frm.setLayout(new BorderLayout(5,5));

 47         

 48         frm.add("North",areatop);

 49         frm.add("Center",areamiddle);

 50         frm.add("South",areabottom);

 51         

 52         areatop.add(mb);

 53         mb.add(open);        

 54         open.add(opent1);

 55         open.add(opent2);

 56         open.setPreferredSize(new Dimension(40,18));

 57         mb.setBackground(frm.getBackground());

 58         areatop.setLayout(new FlowLayout(FlowLayout.LEFT));

 59         areamiddle.setLayout(new FlowLayout(FlowLayout.LEFT));

 60         

 61         areamiddle.add(t1);

 62         t1.setPreferredSize(new Dimension(frm.getWidth()/2-20,10));

 63         areamiddle.add(t2);

 64         t2.setPreferredSize(new Dimension(50,10));

 65         areamiddle.add(left);

 66         left.setPreferredSize(new Dimension(frm.getWidth()/2-20,frm.getHeight()/2));    

 67         areamiddle.add(right);

 68         right.setPreferredSize(new Dimension(frm.getWidth()/2-20,frm.getHeight()/2));

 69         tl.setLineWrap(true);

 70         tr.setLineWrap(true);

 71         

 72         areabottom.add(toBig);

 73         areabottom.add(delbd);

 74         areabottom.add(space);

 75         areabottom.add(count);

 76         

 77         opent1.addActionListener(new ActionListener(){

 78             public void actionPerformed(ActionEvent e) {

 79                 try {

 80                     openfile();

 81                     tl.setText(str);

 82                 } catch (IOException e1) {

 83                     e1.printStackTrace();

 84                 }

 85                 judge();

 86             }

 87         });

 88         opent2.addActionListener(new ActionListener(){

 89             public void actionPerformed(ActionEvent e) {

 90                 try {

 91                     openfile();

 92                     tr.setText(str);

 93                 } catch (IOException e1) {

 94                     e1.printStackTrace();

 95                 }

 96                 judge();

 97             }

 98         });

 99         toBig.addActionListener(new ActionListener(){

100             public void actionPerformed(ActionEvent e){

101                 tl.setText(tobig(tl.getText()));

102                 tr.setText(tobig(tr.getText()));

103             }

104         });

105         

106         delbd.addActionListener(new ActionListener(){

107             public void actionPerformed(ActionEvent e){

108                 tl.setText(del(tl.getText()));

109                 tr.setText(del(tr.getText()));

110                 judge();

111             }

112             

113         });

114         count.addActionListener(new ActionListener(){

115             public void actionPerformed(ActionEvent e){

116                 KShinglingAlgorithm a=new KShinglingAlgorithm();

117                 a.setText1(tl.getText());

118                 a.setText2(tr.getText());

119                 float b=a.getSimilarity(4);

120                 if(b!=-1)

121                     JOptionPane.showMessageDialog(null, Float.toString(b),"相似度", JOptionPane.INFORMATION_MESSAGE); 

122             }

123         });

124         tr.addKeyListener(new KeyAdapter(){

125             public void keyTyped(KeyEvent e){

126                 judge();

127             }

128         });

129         tl.addKeyListener(new KeyAdapter(){

130             public void keyTyped(KeyEvent e){

131                 judge();

132             }

133         });

134     }

135     public void judge(){

136         if(tl.getText().length()!=0||tr.getText().length()!=0) {

137             toBig.setEnabled(true);

138             delbd.setEnabled(true);

139             count.setEnabled(true);

140         }

141         else{

142             toBig.setEnabled(false);

143             delbd.setEnabled(false);

144             count.setEnabled(false);

145         }    

146     }

147     public void openfile() throws IOException{

148         str="";

149         JFileChooser choose=new JFileChooser();        

150         int result = choose.showOpenDialog(this);

151         File file = null; //注意初始化

152         //加过滤器

153         if (result == JFileChooser.APPROVE_OPTION) {

154             file = choose.getSelectedFile();

155             }

156         else{

157             return; //使点取消后不会抛出异常

158         }

159         FileReader fr=new FileReader(file);

160         BufferedReader br=new BufferedReader(fr);

161         char c[]=new char[512];

162         String strline="";

163         while(br.ready()){

164             strline=br.readLine();

165             str+=strline;

166         };

167         br.close();

168         fr.close();

169     }

170     public String tobig(String str){

171         String temp="";

172         for(int i=0;i<str.length();i++)

173         {

174             if(str.charAt(i)>='a'&&str.charAt(i)<='z')

175             {

176                 char t=str.charAt(i);

177                 t=(char)(str.charAt(i)-32);

178                 temp+=t;

179             }

180             else temp+=str.charAt(i);

181         }

182         return temp;

183     }

184     

185     public String del(String str){

186         String temp="";

187         for(int i=0;i<str.length();i++)

188         {

189             char t=str.charAt(i);

190             if(t>='!'&&t<='/'||t>=58&&t<=64||t>=91&&t<=96||t>=123&&t<=126);

191             else temp+=t;

192         }

193         return temp;

194     }

195     public static void main(String[] args){

196         new Outlook();

197         

198         

199     }

200 }
Outlook

 

你可能感兴趣的:(Algorithm)