C# 获取文件字符集

temp.csx

//#! "netstandard2.0"
#r "nuget:NChardet,1.0.1.6"
#r "nuget:System.Text.Encoding.CodePages,4.5.0"
#r "nuget:System.Text.Encoding.Extensions,4.3.0"
using System.IO;
using System.Text;
using NChardet;

// Sample input path; only referenced by the commented-out single-file check further down.
string fileName = @"E:\svn\erp\web\appsettings.json";
/// <summary>
/// Observer handed to the NChardet detector; it simply remembers the most
/// recent charset name passed to <see cref="Notify"/>.
/// </summary>
internal class MyCharsetDetectionObserver : ICharsetDetectionObserver
{
    /// <summary>Last charset name received, or whatever the caller assigned directly.</summary>
    internal string Charset;

    /// <summary>Stores the reported charset name in <see cref="Charset"/>.</summary>
    /// <param name="charset">Charset name supplied by the caller of this callback.</param>
    public void Notify(string charset) => this.Charset = charset;
}

// Shared observer instance: GetFileEncoding clears its Charset at the start of each call and returns it at the end.
MyCharsetDetectionObserver cdo = new MyCharsetDetectionObserver();

/// <summary>
/// Detects the character set of a file by streaming it through an NChardet
/// <c>Detector</c> in 1 KB blocks.
/// </summary>
/// <param name="fileName">Path of the file to inspect; opened read-only with shared read access.</param>
/// <returns>
/// The ASCII body name when every byte of the file passes the detector's ASCII check,
/// the charset name reported to the shared observer otherwise, or <c>null</c> when the
/// file is empty or the detector never reported a charset.
/// </returns>
string GetFileEncoding(string fileName){
    // Registered here as well so the function works even if the caller has not done it yet.
    Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance);
    cdo.Charset = null;
    Detector detector = new Detector();
    detector.Init(cdo);
    const int BLOCK = 1024;
    bool sawData = false;
    // Sticky flag: stays true only while EVERY block read so far was ASCII.
    // Checking each block in isolation (and stopping at the first ASCII block)
    // misreports files whose non-ASCII bytes appear after the first block.
    bool isAscii = true;
    bool done = false;
    using (FileStream fileStream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        byte[] buffer = new byte[BLOCK];
        int count;
        while (!done && (count = fileStream.Read(buffer, 0, BLOCK)) > 0)
        {
            sawData = true;
            if (isAscii)
            {
                isAscii = detector.isAscii(buffer, count);
            }

            // From the first non-ASCII block onward every block goes to the detector;
            // a true result ends the loop early.
            if (!isAscii)
            {
                done = detector.DoIt(buffer, count, false);
            }
        }
    }

    if (!sawData)
    {
        // Empty file: nothing to base a decision on.
        return null;
    }

    if (isAscii)
    {
        cdo.Charset = Encoding.ASCII.BodyName;
    }
    else if (!done)
    {
        // Input ended before DoIt returned true: signal end of data so the detector
        // can still report a charset (presumably its best remaining candidate - see NChardet docs).
        detector.DataEnd();
    }
    return cdo.Charset;
}
//Console.WriteLine("Charset:{0}",GetFileEncoding(fileName));
//Console.WriteLine("DefaultEncoding:{0}",Encoding.Default.BodyName);
/*
Encoding src = Encoding.GetEncoding("GB2312");
Encoding dest = Encoding.UTF8;
string str = "中国";
byte[] bs = src.GetBytes(str);
Console.WriteLine("src:"+BitConverter.ToString(bs));
bs = Encoding.Convert(src,dest,bs);
Console.WriteLine("dest:"+BitConverter.ToString(bs));
Console.WriteLine("str:"+dest.GetString(bs));
*/

// Register the code-pages provider so the GB2312 charset can be resolved
Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance);
/*
Encoding encoding = Encoding.GetEncoding("gb2312");
Console.WriteLine(encoding == null?"gb2312 not found":encoding.CodePage.ToString());
Console.WriteLine(string.Join("\r\n",Encoding.GetEncodings().Select(e=>string.Format("{0},{1}",e.CodePage,e.Name))));
*/


// Batch conversion: walk every *.cs file under `dir`, detect its charset and
// rewrite the file as UTF-8 (no BOM) when it is not already UTF-8 compatible.
string dir = @"E:\git\erp\src";
Console.WriteLine(dir);
string[] files = Directory.GetFiles(dir,"*.cs",SearchOption.AllDirectories);
foreach(string file in files)
{
    string fileEncoding = GetFileEncoding(file);
    Console.Write("{0} Charset:{1}",file.Replace(dir,string.Empty),fileEncoding);
    if(string.IsNullOrEmpty(fileEncoding))
    {
        // Detection gave no answer: fall back to GB2312.
        fileEncoding = "gb2312";
    }
    try
    {
        Encoding srcEncoding = Encoding.GetEncoding(fileEncoding);
        // Compare by code page rather than by object reference: reference equality only
        // holds if GetEncoding happens to hand back the cached Encoding.UTF8 instance.
        // Pure ASCII is skipped too - its bytes are already valid UTF-8, and decoding any
        // stray non-ASCII byte as ASCII would replace it with '?' and lose data.
        bool needsConversion = srcEncoding.CodePage != Encoding.UTF8.CodePage
            && srcEncoding.CodePage != Encoding.ASCII.CodePage;
        if(needsConversion)
        {
            Console.Write(" convert to UTF8");
            File.WriteAllBytes(file,Encoding.UTF8.GetBytes(File.ReadAllText(file,srcEncoding)));
        }
    }
    catch(Exception ex)
    {
        // Best effort: report the failure for this file and continue with the next one.
        Console.Write(ex.Message);
    }
    // This form of conversion raises an error
    //File.WriteAllBytes(file,Encoding.Convert(fileEncoding,Encoding.UTF8,File.ReadAllBytes(file)));
    Console.WriteLine();
}

执行:dotnet-script temp.csx

C# 获取文件字符集_第1张图片

你可能感兴趣的:(C# 获取文件字符集)