如何高效地完成中文分词?

在说分词之前,笔者先介绍一下何为分词:分词就是将连续的字序列按照一定的规范重新组合成词序列的过程。英文中,单词之间以空格作为自然分界符;中文的词与词之间没有这样的分界符,分词因此复杂得多,需要借助一些算法,对初学者来说有一定难度。这里笔者只介绍一种最简单的方式——直接调用现成的分词接口,有兴趣的朋友可以看一下。下面直接上代码,先看 Python 的实现方式:

# -*- coding: utf-8 -*-
# flake8: noqa
__author__ = 'wukong'
 
import urllib
from urllib import urlencode
 
# Fill in the appKey and openId issued to your account.
app_key = "***"
open_id = "***"
 
"""
request_url 请求地址
params 请求参数
method 请求方法
"""
def request_content(request_url, params, method):
    """Send ``params`` to ``request_url``, print the response body and return it.

    request_url -- endpoint URL without a query string.
    params      -- dict of request parameters; URL-encoded before sending.
    method      -- "get" (case-insensitive) appends the parameters to the URL
                   as a query string; any other value, including None, passes
                   them to ``urlopen`` as the request body instead.

    Returns the raw response body, so callers can parse it rather than
    relying on what is printed to stdout.
    """
    query = urlencode(params)

    if method and method.lower() == "get":
        response = urllib.urlopen("%s?%s" % (request_url, query))
    else:
        response = urllib.urlopen(request_url, query)

    # Always release the connection, even when reading the body fails.
    try:
        content = response.read()
    finally:
        response.close()

    print(content)
    return content
 
   
def main():
    """Call the chinesekeyword/analysis endpoint with one sample sentence.

    The response is printed by ``request_content``.
    """
    # Full endpoint = service root + API path.
    request_url = "http://api.xiaocongjisuan.com/" + "data/chinesekeyword/analysis"

    params = {
        # Credentials: the same for every call.
        "appKey": app_key,
        "openId": open_id,
        # The part that changes per request: the text to analyse.
        "content": "我是一个中国人,你知道嘛",
    }

    request_content(request_url, params, "get")


if __name__ == "__main__":
    main()

再以 Java 为例:

package com.xiaocongjisuan.module.example;
 
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;
 
public class Application {

    /** Charset used to URL-encode parameters and to decode the response body. */
    public static final String DEF_CHATSET = "UTF-8";
    /** Connect timeout passed to the connection, in milliseconds. */
    public static final int DEF_CONN_TIMEOUT = 30000;
    /** Read timeout passed to the connection, in milliseconds. */
    public static final int DEF_READ_TIMEOUT = 30000;
    /** Value sent in the User-agent request header; non-final so callers can replace it. */
    public static String userAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36";

    // Fill in the appKey and openId issued to your account.
    public static final String APP_KEY = "yours";
    public static final String OPEN_ID = "yours";

    /**
     * Converts a parameter map into an {@code application/x-www-form-urlencoded} string.
     *
     * <p>Both keys and values are percent-encoded with {@link #DEF_CHATSET}. Values are
     * converted with {@link String#valueOf(Object)}, so a {@code null} value is sent as
     * the text {@code "null"}. Pairs appear in the map's iteration order.
     *
     * @param params request parameters; may be {@code null}
     * @return {@code key=value} pairs joined by {@code &}, or an empty string when
     *         {@code params} is {@code null} or empty
     */
    public static String urlEncode(Map<String,Object> params) {
        if (params == null || params.isEmpty()) {
            return "";
        }

        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String,Object> entry : params.entrySet()) {
            if (sb.length() > 0) {
                sb.append('&');
            }
            sb.append(encodeComponent(String.valueOf(entry.getKey())))
              .append('=')
              .append(encodeComponent(String.valueOf(entry.getValue())));
        }
        return sb.toString();
    }

    /** Percent-encodes one key or value using {@link #DEF_CHATSET}. */
    private static String encodeComponent(String text) {
        try {
            return URLEncoder.encode(text, DEF_CHATSET);
        } catch (UnsupportedEncodingException e) {
            // Fail loudly instead of silently dropping a parameter from the request.
            throw new IllegalStateException("Unsupported charset: " + DEF_CHATSET, e);
        }
    }

    /**
     * Sends an HTTP request and returns the response body as text.
     *
     * <p>For {@code "get"} the parameters are appended to the URL as a query string; for
     * {@code "post"} they are written as a form-encoded request body. Any other value of
     * {@code method}, including {@code null}, results in a GET without parameters.
     * Redirects are not followed. Line terminators in the response are dropped.
     *
     * @param requestUrl request URL
     * @param params request parameters; may be {@code null}
     * @param method request method, compared case-insensitively; may be {@code null}
     * @return the response body, or {@code null} if an {@link IOException} occurred
     *         (malformed URL, connection failure, HTTP error status, ...); the
     *         exception is printed to standard error
     * @throws Exception declared for backward compatibility with existing callers
     */
    public static String requestContent(String requestUrl, Map<String,Object> params, String method) throws Exception {
        // Constant-first comparison tolerates a null method.
        boolean isGet = "get".equalsIgnoreCase(method);
        boolean isPost = "post".equalsIgnoreCase(method);

        HttpURLConnection conn = null;
        try {
            if (isGet) {
                String query = urlEncode(params);
                if (query.length() > 0) {
                    // Extend an existing query string instead of adding a second '?'.
                    String separator = requestUrl.indexOf('?') >= 0 ? "&" : "?";
                    requestUrl = requestUrl + separator + query;
                }
            }

            conn = (HttpURLConnection) new URL(requestUrl).openConnection();
            conn.setRequestMethod(isPost ? "POST" : "GET");
            if (isPost) {
                conn.setDoOutput(true);
                conn.setDoInput(true);
                // Tell the server how to interpret the request body.
                conn.setRequestProperty("Content-Type",
                        "application/x-www-form-urlencoded; charset=" + DEF_CHATSET);
            }

            conn.setRequestProperty("User-agent", userAgent);
            conn.setUseCaches(false);
            conn.setConnectTimeout(DEF_CONN_TIMEOUT);
            conn.setReadTimeout(DEF_READ_TIMEOUT);
            conn.setInstanceFollowRedirects(false);
            conn.connect();

            if (isPost && params != null) {
                writeForm(conn, params);
            }
            return readBody(conn);
        } catch (IOException e) {
            // Existing contract: report the failure and hand back null.
            e.printStackTrace();
            return null;
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /** Writes the form-encoded parameters as the request body and closes the stream. */
    private static void writeForm(HttpURLConnection conn, Map<String,Object> params) throws IOException {
        DataOutputStream out = new DataOutputStream(conn.getOutputStream());
        try {
            out.write(urlEncode(params).getBytes(DEF_CHATSET));
            out.flush();
        } finally {
            out.close();
        }
    }

    /** Reads the whole response body, decoded with {@link #DEF_CHATSET}, without line terminators. */
    private static String readBody(HttpURLConnection conn) throws IOException {
        InputStream is = conn.getInputStream();
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, DEF_CHATSET));
        try {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
            return sb.toString();
        } finally {
            try {
                reader.close();
            } catch (IOException ignored) {
                // A close failure must not replace the body or an exception already in flight.
            }
        }
    }

    /**
     * Demo entry point: sends one sample sentence to the Chinese keyword analysis
     * endpoint and prints the response.
     */
    public static void main(String[] args) throws Exception {
        String domain = "http://api.xiaocongjisuan.com/";
        String servlet = "data/chinesekeyword/analysis";
        String method = "get";

        String requestUrl = domain + servlet;
        Map<String,Object> params = new HashMap<String,Object>();
        params.put("appKey", APP_KEY);
        params.put("openId", OPEN_ID);

        // The part that changes per request: the text to analyse. The sample Chinese
        // sentence is written with Unicode escapes so this file compiles under any
        // source encoding.
        params.put("content", "\u6211\u662f\u4e00\u4e2a\u4e2d\u56fd\u4eba,\u4f60\u77e5\u9053\u561b");

        String result = requestContent(requestUrl, params, method);
        System.out.println(result);
    }
}

原理主要是调用接口:直接传入一段字符串,接口会自动把分词结果以 JSON 或 XML 的形式返回,具体文档可以点我查看。这种实现方式很简单,省去了大量的开发时间,也屏蔽了不同语言之间的差异,值得推荐。

你可能感兴趣的:(如何高效的完成中文分词?)