我相信大家都有过这样的需求,把doc、ppt、excel、pdf、txt中的文本内容提取出来。提取出来的文本内容可用于文档内容的全文索引,文档的基本内容摘要等。在度娘上搜索“如何提取文档内容”,确实有很多demo可以借鉴,但是,很多demo要么是需要付费的jar包,要么提取出的内容不全或者乱码。Java有许多开源工具包可用,尚且还不完美,何况其它一些开发语言如node、golang、ruby、python呢!
如果能有免费的API接口可以调用,那就不管是啥语言了,省时又省心,何乐而不为呢!基于此,我百度了下,没想到还真找到了。所以在此记录下,也希望能帮助到大家。
接口详细说明:https://www.xiaocongjisuan.com/show/api/11
接口介绍:提取word、ppt、pdf等文档中的内容返回给接口调用者
接口详细介绍中已经说的很清楚了,下面我简单说下注意事项:
1、关于appKey和openId
appKey:接口唯一标识,在用户后台->应用中心->我的接口查看
openId:平台id,注册后系统自动生成,在用户后台->用户中心->账户信息查看
2、关于文件传值
接口传值统一采用base64编码。由于GET请求的参数有长度限制,调用API接口时必须使用POST方式。【文档的base64编码值请保持数据完整(可不带数据头)。以TXT为例,可以是 [“data:text/plain;base64,aHR0cHM6…”],也可以是 [“aHR0cHM6…”]】
3、错误码
接口返回的错误码,具体内容请查看接口详细说明
同时接口说明中也有各种开发语言的调用DEMO,如:java、python、php、c#、golang、nodeJS,其实不止上述这些语言,只要可以发出POST请求,就可以使用该接口,非常的方便。这个平台还有很多其它功能的接口,大部分是免费的。像什么天气预报、万年历、老黄历、中文分词、电影数据查询、电子书查询、网盘数据等等,大家慢慢去发现吧!下面贴一下文档内容提取的相关代码:
JAVA版本:
package com.xiaocongjisuan.module.example;

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;

/**
 * Demo client for the document-content extraction API: posts a base64-encoded
 * document to the service and prints the raw response.
 */
public class Application {

    /** Charset used to URL-encode the request and to decode the response. */
    public static final String DEF_CHATSET = "UTF-8";
    /** Connect timeout in milliseconds. */
    public static final int DEF_CONN_TIMEOUT = 30000;
    /** Read timeout in milliseconds. */
    public static final int DEF_READ_TIMEOUT = 30000;
    /** Value sent in the {@code User-agent} request header. */
    public static String userAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36";

    // Replace with the appKey and openId issued for your account.
    public static final String APP_KEY = "yours";
    public static final String OPEN_ID = "yours";

    /**
     * Converts a parameter map into a URL-encoded {@code key=value&key=value} string.
     *
     * @param params request parameters; may be {@code null}
     * @return the encoded query string, or an empty string when {@code params} is
     *         {@code null} or empty; a {@code null} value is encoded as the text "null"
     */
    public static String urlEncode(Map<String, Object> params) {
        if (params == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, Object> entry : params.entrySet()) {
            if (sb.length() > 0) {
                sb.append('&');
            }
            try {
                // Keys are encoded as well so that reserved characters cannot corrupt the query.
                sb.append(URLEncoder.encode(String.valueOf(entry.getKey()), DEF_CHATSET))
                        .append('=')
                        .append(URLEncoder.encode(String.valueOf(entry.getValue()), DEF_CHATSET));
            } catch (UnsupportedEncodingException e) {
                // Every Java platform is required to support UTF-8, so this cannot happen in practice.
                throw new IllegalStateException(DEF_CHATSET + " is not supported", e);
            }
        }
        return sb.toString();
    }

    /**
     * Sends an HTTP request and returns the response body.
     *
     * <p>For "get" the parameters are appended to the URL; for "post" they are sent as a
     * form-encoded body. Any other method (including {@code null}) results in a plain GET
     * without parameters. Line terminators in the response are dropped.
     *
     * @param requestUrl request URL
     * @param params     request parameters; may be {@code null}
     * @param method     "get" or "post", case-insensitive
     * @return the response body, or {@code null} when an I/O error occurs (the stack trace
     *         is printed)
     * @throws Exception if closing the response reader fails
     */
    public static String requestContent(String requestUrl, Map<String, Object> params, String method)
            throws Exception {
        // Constant-first comparison keeps a null method from throwing.
        boolean isGet = "get".equalsIgnoreCase(method);
        boolean isPost = "post".equalsIgnoreCase(method);

        HttpURLConnection conn = null;
        BufferedReader reader = null;
        String rs = null;
        try {
            if (isGet) {
                requestUrl = requestUrl + "?" + urlEncode(params);
            }
            URL url = new URL(requestUrl);
            conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod(isPost ? "POST" : "GET");
            if (isPost) {
                conn.setDoOutput(true);
                conn.setDoInput(true);
                conn.setRequestProperty("Content-Type",
                        "application/x-www-form-urlencoded;charset=" + DEF_CHATSET);
            }

            // Connection settings.
            conn.setRequestProperty("User-agent", userAgent);
            conn.setUseCaches(false);
            conn.setConnectTimeout(DEF_CONN_TIMEOUT);
            conn.setReadTimeout(DEF_READ_TIMEOUT);
            conn.setInstanceFollowRedirects(false);
            conn.connect();

            if (isPost && params != null) {
                DataOutputStream out = new DataOutputStream(conn.getOutputStream());
                try {
                    out.write(urlEncode(params).getBytes(DEF_CHATSET));
                    out.flush();
                } finally {
                    // Closing the stream completes the request body before the response is read.
                    out.close();
                }
            }

            // Read the response.
            InputStream is = conn.getInputStream();
            reader = new BufferedReader(new InputStreamReader(is, DEF_CHATSET));
            StringBuilder body = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line);
            }
            rs = body.toString();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (reader != null) {
                    reader.close();
                }
            } finally {
                // Disconnect even when closing the reader fails.
                if (conn != null) {
                    conn.disconnect();
                }
            }
        }
        return rs;
    }

    /**
     * Posts a sample request to the document-content endpoint and prints the result.
     *
     * @param args unused
     * @throws Exception if the request cannot be completed
     */
    public static void main(String[] args) throws Exception {
        String domain = "http://api.xiaocongjisuan.com/";
        String servlet = "develop/doccontent/analysis";
        String method = "post";
        String requestUrl = domain + servlet;

        Map<String, Object> params = new HashMap<String, Object>();
        params.put("appKey", APP_KEY);
        params.put("openId", OPEN_ID);

        // Request-specific values: the base64-encoded document and its file name.
        String base64Doc = "";
        String docName = "test.docx";
        params.put("base64Doc", base64Doc);
        params.put("docName", docName);

        String result = requestContent(requestUrl, params, method);
        System.out.println(result);
    }
}
PYTHON版本:
# -*- coding: utf-8 -*-
# flake8: noqa
"""Demo client for the document-content extraction API."""
__author__ = 'wukong'

try:  # Python 3
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlencode, urlopen

# Fill in the appKey and openId issued for your account.
app_key = "***"
open_id = "***"


def request_content(request_url, params, method):
    """Send the request and print the response body.

    request_url -- request URL
    params      -- dict of request parameters
    method      -- "get" appends the parameters to the URL; anything else
                   sends them as a POST body
    """
    query = urlencode(params)
    if method and method.lower() == "get":
        f = urlopen("%s?%s" % (request_url, query))
    else:
        # urlencode output is ASCII-only; Python 3 requires the POST body as bytes.
        f = urlopen(request_url, query.encode("utf-8"))
    try:
        content = f.read()
    finally:
        f.close()
    # Python 3 returns bytes; decode so the text is printed rather than b'...'.
    if not isinstance(content, str):
        content = content.decode("utf-8")
    print(content)


def build_params(base64_doc, doc_name):
    """Return the request parameters for one document.

    base64_doc -- base64-encoded document content
    doc_name   -- file name of the document, e.g. "test.docx"
    """
    return {
        # The credential must be sent as "appKey"; sending it under "docName"
        # leaves the request without an app key.
        "appKey": app_key,
        "openId": open_id,
        "base64Doc": base64_doc,
        "docName": doc_name,
    }


def main():
    domain = "http://api.xiaocongjisuan.com/"
    servlet = "develop/doccontent/analysis"
    method = "post"
    request_url = domain + servlet

    # Request-specific values.
    base64Doc = ""
    params = build_params(base64Doc, "test.docx")

    request_content(request_url, params, method)


if __name__ == '__main__':
    main()
PHP版本:
<?php
/**
 * Demo client for the document-content extraction API.
 *
 * @author
 * @copyright 2019
 */

header("content-type:text/html;charset=utf-8"); // declare the output encoding

// Fill in the appKey and openId issued for your account.
$app_key = "***";
$open_id = "***";

/**
 * Sends an HTTP request with cURL and returns the response body.
 *
 * @param string       $url    request URL
 * @param string|false $params URL-encoded parameter string, or false for none
 * @param bool         $ispost true to send a POST, false to send a GET
 * @return string|false response body, or false when the transfer fails
 */
function http_curl($url, $params = false, $ispost = false)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
    curl_setopt($ch, CURLOPT_USERAGENT, "xiaocongjisuan");
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 60);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    if ($ispost) {
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $params);
        curl_setopt($ch, CURLOPT_URL, $url);
    } elseif ($params) {
        curl_setopt($ch, CURLOPT_URL, $url . '?' . $params);
    } else {
        curl_setopt($ch, CURLOPT_URL, $url);
    }

    // With CURLOPT_RETURNTRANSFER set, curl_exec returns the body or false.
    $response = curl_exec($ch);
    // Release the handle on both the success and the failure path.
    curl_close($ch);

    return $response;
}

/**
 * Builds the sample request and returns the API response.
 *
 * @return string|false response body, or false when the request fails
 */
function main()
{
    global $app_key;
    global $open_id;

    $domain = "http://api.xiaocongjisuan.com/";
    $servlet = "develop/doccontent/analysis";
    // The base64 payload is too long for a query string, so the request is sent as POST.
    $method = "post";
    $url = $domain . "" . $servlet;

    $params = array();
    $params['appKey'] = $app_key;
    $params['openId'] = $open_id;

    // Request-specific values.
    $params["base64Doc"] = "";
    $params["docName"] = "test.docx";

    // Encoding conversion.
    // NOTE(review): this treats every value as GBK text; confirm it matches the
    // encoding this file is saved in, otherwise non-ASCII names will be garbled.
    foreach ($params as $key => $value) {
        $params[$key] = mb_convert_encoding($value, "UTF-8", "GBK");
    }

    $paramstring = http_build_query($params);
    $content = http_curl($url, $paramstring, strtolower($method) === "post");
    return $content;
}

echo main();
?>
C#版本:
using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.IO;

namespace ConsoleApplication1
{
    /// <summary>
    /// Demo client for the document-content extraction API.
    /// </summary>
    class Program
    {
        // Replace with the appKey and openId issued for your account.
        private static string appKey = "yours";
        private static string openId = "yours";

        /// <summary>
        /// Reads the whole response body as text and closes the response.
        /// </summary>
        /// <param name="rsp">HTTP response to read</param>
        /// <param name="encoding">encoding used to decode the body</param>
        /// <returns>the response body</returns>
        static string getResponseAsString(HttpWebResponse rsp, Encoding encoding)
        {
            System.IO.Stream stream = null;
            StreamReader reader = null;
            try
            {
                // Read the HTTP response as a character stream.
                stream = rsp.GetResponseStream();
                reader = new StreamReader(stream, encoding);
                return reader.ReadToEnd();
            }
            finally
            {
                // Release resources.
                if (reader != null) reader.Close();
                if (stream != null) stream.Close();
                if (rsp != null) rsp.Close();
            }
        }

        /// <summary>
        /// Maps a response charset name to an Encoding, falling back to UTF-8
        /// when the name is missing or not recognised.
        /// </summary>
        static Encoding resolveEncoding(string charset)
        {
            if (string.IsNullOrEmpty(charset))
            {
                return Encoding.UTF8;
            }
            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                return Encoding.UTF8;
            }
        }

        /// <summary>
        /// Builds a name=value&amp;name=value string from the parameters.
        /// </summary>
        /// <param name="parameters">request parameters</param>
        /// <param name="encode">"gb2312" or "utf8" to URL-encode values with that
        /// encoding; any other value appends the values unencoded</param>
        /// <returns>the query string</returns>
        static string buildQuery(IDictionary<string, object> parameters, string encode)
        {
            StringBuilder postData = new StringBuilder();
            bool hasParam = false;
            foreach (KeyValuePair<string, object> entry in parameters)
            {
                string name = entry.Key;
                // Parameters with an empty name are skipped; empty values are still sent.
                if (string.IsNullOrEmpty(name))
                {
                    continue;
                }
                // A null value is sent as an empty string instead of throwing.
                string value = entry.Value == null ? "" : entry.Value.ToString();

                if (hasParam)
                {
                    postData.Append("&");
                }
                postData.Append(name);
                postData.Append("=");
                if (encode == "gb2312")
                {
                    postData.Append(System.Web.HttpUtility.UrlEncode(value, Encoding.GetEncoding("gb2312")));
                }
                else if (encode == "utf8")
                {
                    postData.Append(System.Web.HttpUtility.UrlEncode(value, Encoding.UTF8));
                }
                else
                {
                    postData.Append(value);
                }
                hasParam = true;
            }
            return postData.ToString();
        }

        /// <summary>
        /// Sends the request and returns the response body.
        /// </summary>
        /// <param name="url">request URL</param>
        /// <param name="parameters">request parameters</param>
        /// <param name="method">"post" (case-insensitive) sends a form-encoded POST;
        /// anything else sends a GET with the parameters in the query string</param>
        /// <returns>the response body; for POST, the exception message when the
        /// request fails (GET failures propagate as exceptions)</returns>
        static string requestContent(string url, IDictionary<string, object> parameters, string method)
        {
            if (string.Equals(method, "post", StringComparison.OrdinalIgnoreCase))
            {
                HttpWebResponse rsp = null;
                try
                {
                    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
                    req.Method = "POST";
                    req.KeepAlive = false;
                    req.ProtocolVersion = HttpVersion.Version10;
                    req.Timeout = 5000;
                    req.ContentType = "application/x-www-form-urlencoded;charset=utf-8";

                    byte[] postData = Encoding.UTF8.GetBytes(buildQuery(parameters, "utf8"));
                    // The request stream is closed before the response is requested.
                    using (Stream reqStream = req.GetRequestStream())
                    {
                        reqStream.Write(postData, 0, postData.Length);
                    }

                    rsp = (HttpWebResponse)req.GetResponse();
                    Encoding encoding = resolveEncoding(rsp.CharacterSet);
                    return getResponseAsString(rsp, encoding);
                }
                catch (Exception ex)
                {
                    return ex.Message;
                }
                finally
                {
                    if (rsp != null) rsp.Close();
                }
            }
            else
            {
                // Create the GET request.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url + "?" + buildQuery(parameters, "utf8"));
                request.Method = "GET";
                request.ReadWriteTimeout = 5000;
                request.ContentType = "text/html;charset=UTF-8";

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream myResponseStream = response.GetResponseStream())
                using (StreamReader myStreamReader = new StreamReader(myResponseStream, Encoding.GetEncoding("utf-8")))
                {
                    // Return the response content.
                    return myStreamReader.ReadToEnd();
                }
            }
        }

        static void Main(string[] args)
        {
            String domain = "http://api.xiaocongjisuan.com/";
            String servlet = "develop/doccontent/analysis";
            String method = "post";
            String url = domain + servlet;

            var parameters = new Dictionary<string, object>();
            parameters.Add("appKey", appKey);
            parameters.Add("openId", openId);

            // Request-specific values.
            String base64Doc = "";
            String docName = "test.docx";
            parameters.Add("base64Doc", base64Doc);
            parameters.Add("docName", docName);

            string result = requestContent(url, parameters, method);
            Console.WriteLine(result);
            Console.Read();
        }
    }
}
其它语言的调用示例,请参阅接口说明文档。
由于接口采用的是base64传值,文件转base64对各个开发语言来说都非常简单。所以在此就不再赘述。以上