Text文档编码识别方法

阅读更多
http://www.cnblogs.com/preacher/p/6084802.html

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;

namespace 文章操作工具
{
    public class TextHelper
    {
        public static System.Text.Encoding GetType(string filename)
        {
            FileStream fs = new FileStream(filename, FileMode.Open, FileAccess.Read);
            System.Text.Encoding r = GetType(fs);
            fs.Close();
            return r;
        }
       
        public static System.Text.Encoding GetType(FileStream fs)
        {
            /*
                                Unicode   
                                ------------------
   254

                                ======================
                                UnicodeBigEndian
                                -------------------
   255

                                ======================
                                UTF8
                                -------------------
228
229
230
231
232
233
   187
            
                                ======================
                                ANSI
                                -------------------
176
177
179
180
182
185
191
194
196
198
201
202
205
206
208
209
210
211
213
167
213
228
            */
            BinaryReader r = new BinaryReader(fs, System.Text.Encoding.Default);
            byte[] ss = r.ReadBytes(3);
            int lef = ss[0];
            int mid = ss[1];
            int rig = ss[2];
            r.Close();
            /*  文件头两个字节是255 254,为Unicode编码;
                文件头三个字节  254 255 0,为UTF-16BE编码;
                文件头三个字节  239 187 191,为UTF-8编码;*/
            if (lef == 255 && mid == 254)
            {
                return Encoding.Unicode;
            }
            else if (lef == 254 && mid == 255 && rig == 0)
            {
                return Encoding.BigEndianUnicode;
            }
            else if (lef == 254 && mid == 255)
            {
                return Encoding.BigEndianUnicode;
            }
            else if (lef == 239 && mid == 187 && rig == 191)
            {
                return Encoding.UTF8;
            }
            else if (lef == 239 && mid == 187)
            {
                return Encoding.UTF8;
            }
            else if (lef == 196 && mid == 167
                || lef == 206 && mid == 228
                || lef == 202 && mid == 213)
            {
                return Encoding.Default;
            }
            else
            {
                if (lef == 34)
                {
                    if (mid < 220) return Encoding.Default;
                    else return Encoding.UTF8;
                }
                else
                {
                    if (lef < 220) return Encoding.Default;
                    else return Encoding.UTF8;
                }
            }
        }
    }
}

你可能感兴趣的:(Text文档编码识别方法)