Illegal mix of collations (utf8_general_ci,IMPLICIT) and (utf8mb4_general_ci,COERCIBLE)for operation

现象

Illegal mix of collations (utf8_general_ci,IMPLICIT) and (utf8mb4_general_ci,COERCIBLE) for operation ‘like’ (errno 1267) (sqlstate HY000) during query

原因

在mysql中设置utf8-general-ci时是无法插入4字节utf-8字符的,如果mysql中设置了utf8-general-ci在插入4字节字符时则会出问题,

解决方案

为了避免此类问题, 可以提前对字符串做判断, 判断方法的代码片段如下:

package com.jd.market.sc.utils;

import org.junit.Assert;
import org.junit.Test;

import java.nio.charset.Charset;

/**
 * 处理由于mysql配置字符编码utf8 general ci, 而应用会接受utf8-mb4字节的情况
 * Created by zhaoyukai on 2017/9/28.
 */
public class UTF8MB4 {
    private final static Charset UTF8 = Charset.forName("UTF-8");

    /**
     * 判断字符串中是否存在4字节字符
     *
     * @param input 输入字符串
     * @return 包含4字节返回true, 否则为false
     */
    public static boolean containsMb4Char(String input) {
        if (input == null) {
            return false;
        }
        byte[] bytes = input.getBytes(UTF8);
        for (int i = 0; i < bytes.length; i++) {
            byte b = bytes[i];
            //four bytes
            if ((b & 0XF0) == 0XF0) {
                return true;
            } else if ((b & 0XE0) == 0XE0) {
                //three bytes
                //forward 2 byte
                i += 2;
            } else if ((b & 0XC0) == 0XC0) {
                i += 1;
            }
        }
        return false;
    }

    /**
     * 替换可能存在的utf8 4字节字符
     *
     * @param input       输入字符串
     * @param replacement 替换为的字符串
     * @return 替换后的utf8字符串
     */
    public static String replaceMb4Char(String input, String replacement) {
        if (input == null) {//为空直接返回
            throw new IllegalArgumentException("input can not be null when replaceMb4Char");
        }
        if(!containsMb4Char(input)){//不包含四字节的utf8mb4字符,直接返回原值
            return input;
        }
        //包含四字节的utf8mb4字符
        StringBuilder sb = new StringBuilder(input.length());
        byte[] bytes = input.getBytes(UTF8);
        char[] chars = input.toCharArray();
        int charIdx = 0;
        for (int i = 0; i < bytes.length; i++) {
            byte b = bytes[i];
            //four bytes
            if ((b & 0XF0) == 0XF0) {
                sb.append(replacement);
                //utf-8四字节字符unicode后变为2个字符, 故字符下标多加1
                charIdx += 2;
                i += 3;
                continue;
            } else if ((b & 0XE0) == 0XE0) {
                //three bytes
                //forward 2 byte
                i += 2;
            } else if ((b & 0XC0) == 0XC0) {
                i += 1;
            }
            sb.append(chars[charIdx]);
            charIdx++;
        }
        return sb.toString();
    }

    @Test
    public void testReplacement() {
        String input = "A啊中\uD83D\uDE00\uD83D\uDC7D\uD83D\uDC94哈哈哈";
        String output = UTF8MB4.replaceMb4Char(input, "");
        String expect = "A啊中哈哈哈";
        Assert.assertEquals(expect, output);
    }

    @Test
    public void testContainsMb4() {
        testFalse("呵呵呵");
        testFalse("AAAA");
        testFalse(",,,");
        testFalse("中国。。,&………………");
        testFalse("我们mmm他们..你们abcdddd牛m");
        testTrue("中山大?");
        testTrue("\uD840\uDC87");
        testTrue("?");
        testTrue("【好玩吧、吧?秒审核不墨迹(好玩吧)赏2.3元】");
    }

    @Test
    public void testReplacePerf() {
        long startMm = System.currentTimeMillis();
        int times = 10000000;
        while (times > 0) {
            UTF8MB4.replaceMb4Char("朝阳区和平街胜古东里1号楼4单元60\uD83D\uDC7D\uD83D\uDC941", "");
            times--;
        }
        long end = System.currentTimeMillis();
        long used = end - startMm;
        System.out.println(used);
    }

    @Test
    public void testContainsPerf() {
        long startMm = System.currentTimeMillis();
        int times = 10000000;
        while (times > 0) {
            UTF8MB4.containsMb4Char("朝阳区和平街胜古东里1号楼4单元60\uD83D\uDC7D\uD83D\uDC941");
            times--;
        }
        long end = System.currentTimeMillis();
        long used = end - startMm;
        System.out.println(used);
    }

    void testTrue(String input) {
        boolean contains = UTF8MB4.containsMb4Char(input);
        Assert.assertTrue(contains);
    }

    void testFalse(String input) {
        boolean contains = UTF8MB4.containsMb4Char(input);
        Assert.assertFalse(contains);
    }

    @Test
    public void testMb4() {
        String chs = "A啊中\uD83D\uDE00\uD83D\uDC7D\uD83D\uDC94";

//        byte[] bytes = chs.getBytes(UTF8);
        byte[] masks = {
                (byte) 0X00,
                (byte) 0X80,
                (byte) 0XC0,
                (byte) 0XE0,
        };
        char c = 'ﭾ';
        int count = 1;
        System.out.println(String.format("%02X", (int) c));
        while (count < 110000) {
            char nc = (char) ((int) c + count);
            System.out.print(nc);
            byte[] bytes = String.valueOf(nc).getBytes(UTF8);
            System.out.print(String.format("%02X", bytes[0]));
            count++;
            if (count % 50 == 0) {
                System.out.println();
            }
        }
    }
}

转载自:http://outofmemory.cn/code-snippet/82878/java-utf-8-codec-utf8-mb4

你可能感兴趣的:(开发中遇到的坑点)