最近在写开题报告，发现从 PDF 中复制出来的文字非常凌乱：标点全角、半角混杂，还莫名其妙地多出许多空格，严重影响复制粘贴。
于是用 JS 写了一个解决办法，基本思路就是用正则表达式做替换。为了能方便地加入新的标点转换规则，还在结构上做了一些优化。
<html>
<!--
  Clean-up tool for text copied out of PDFs.
  - Punctuation that follows a non-ASCII-word character (e.g. a CJK character)
    is converted to its full-width form.
  - Full-width punctuation that follows an ASCII word character is converted
    to its half-width (ASCII) form.
  - All whitespace is removed.
-->
<head>
<meta charset="UTF-8">
<script type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/jquery/1/jquery.min.js"></script>
<script type="text/javascript">
$(function () {
  // Bind the button events once the DOM is ready.
  $('#format').click(format);
  $('#clear').click(clear);
});

/**
 * A single replacement rule: a pattern and what to replace each match with.
 * The two parts are stored under the numeric slots REG and REP.
 *
 * @param {RegExp} reg - pattern to search for (expected to carry the `g` flag).
 * @param {string|Function} rep - replacement passed to String.prototype.replace.
 */
function Strategy(reg, rep) {
  this[this.REG] = reg;
  this[this.REP] = rep;
}
Strategy.prototype.REG = 0;
Strategy.prototype.REP = 1;

/**
 * Build the replacement for one match: the untouched preceding character
 * followed by the mapped punctuation mark.
 *
 * @param {string} p1 - the character captured in front of the punctuation.
 * @param {string} p2 - the punctuation mark that was matched.
 * @param {Object} mapping - lookup table from matched mark to replacement mark.
 * @returns {string} replacement text for the whole match.
 */
function change(p1, p2, mapping) {
  return p1 + mapping[p2];
}

/**
 * Build a global regex matching `word` followed by any one key of `mapping`.
 * Group 1 captures the `word` part, group 2 captures the punctuation mark.
 *
 * @param {string} word - regex source for the preceding character (e.g. '\\w').
 * @param {Object} mapping - table whose own keys are single punctuation marks.
 * @returns {RegExp} the compiled pattern with the `g` flag.
 */
function getRegOf(word, mapping) {
  var keys = Object.keys(mapping);
  var charClass = '';
  for (var i = 0; i < keys.length; i++) {
    // Only these four characters are special inside a character class.
    charClass += keys[i].replace(/[\\\]\^\-]/g, '\\$&');
  }
  return new RegExp('(' + word + ')([' + charClass + '])', 'g');
}

/**
 * Replacer: half-width punctuation -> full-width punctuation.
 * Receives the arguments String.prototype.replace passes to a replacer
 * function (whole match, group 1, group 2, ...).
 *
 * @param {string} match - the whole match (unused).
 * @param {string} prefix - the preceding character (group 1).
 * @param {string} punct - the half-width punctuation mark (group 2).
 * @returns {string} prefix followed by the full-width mark.
 */
function en2cnChange(match, prefix, punct) {
  return change(prefix, punct, en2cnChange.prototype.mapping.mapping);
}
en2cnChange.prototype.mapping = {
  // Full-width marks are written as escapes so they survive copy/paste and
  // Unicode normalization of this source file.
  mapping: {
    ',': '\uFF0C', // FULLWIDTH COMMA
    '.': '\u3002', // IDEOGRAPHIC FULL STOP
    ';': '\uFF1B', // FULLWIDTH SEMICOLON
    '!': '\uFF01'  // FULLWIDTH EXCLAMATION MARK
  },
  /**
   * Lazily build (once) the pattern "non-word character + half-width mark".
   * NOTE(review): \W also matches whitespace and other punctuation, so a mark
   * that follows a space is converted even if the text before the space is ASCII.
   */
  reg: function () {
    if (this._reg === undefined) {
      this._reg = getRegOf('\\W', this.mapping);
    }
    return this._reg;
  }
};

/**
 * Replacer: full-width punctuation -> half-width punctuation.
 *
 * @param {string} match - the whole match (unused).
 * @param {string} prefix - the preceding character (group 1).
 * @param {string} punct - the full-width punctuation mark (group 2).
 * @returns {string} prefix followed by the half-width mark.
 */
function cn2enChange(match, prefix, punct) {
  return change(prefix, punct, cn2enChange.prototype.mapping.mapping);
}
cn2enChange.prototype.mapping = {
  mapping: {
    '\uFF0C': ',', // FULLWIDTH COMMA
    '\u3002': '.', // IDEOGRAPHIC FULL STOP
    '\uFF1B': ';', // FULLWIDTH SEMICOLON
    '\uFF01': '!'  // FULLWIDTH EXCLAMATION MARK
  },
  /** Lazily build (once) the pattern "ASCII word character + full-width mark". */
  reg: function () {
    if (this._reg === undefined) {
      this._reg = getRegOf('\\w', this.mapping);
    }
    return this._reg;
  }
};

/**
 * Click handler: read #input, apply every strategy in order, write #output.
 */
function format() {
  var str = $('#input').val();
  var reg = Strategy.prototype.REG;
  var rep = Strategy.prototype.REP;
  var strategies = format.prototype.strategies;
  // Indexed loop: for...in on an array would also visit inherited enumerable keys.
  for (var i = 0; i < strategies.length; i++) {
    var strategy = strategies[i];
    str = str.replace(strategy[reg], strategy[rep]);
  }
  $('#output').val(str);
}
// Order matters: punctuation is fixed first, whitespace is stripped last.
format.prototype.strategies = [
  new Strategy(en2cnChange.prototype.mapping.reg(), en2cnChange),
  new Strategy(cn2enChange.prototype.mapping.reg(), cn2enChange),
  new Strategy(/\s/g, '') // remove all white space
];

/**
 * Click handler: empty every textarea on the page.
 */
function clear() {
  $('textarea').val('');
}
</script>
<style type="text/css">
textarea {
  display: inline-block;
  width: 45%;
  height: 80%;
  margin: 1em;
}
</style>
</head>
<body>
<textarea id="input" placeholder="input"></textarea>
<textarea id="output" placeholder="output"></textarea>
<br/>
<button id="format">format</button>
<button id="clear">clear</button>
</body>
</html>