* jchardet
* juniversalchardet
といったライブラリが見つかったが、ここではひとまず外部ライブラリを使わず、Java 標準 API(java.nio.charset)だけで文字コードを判定する方法を示す。
- // Candidate decoders; detectEncoding tries them in list order.
- private final List<CharsetDecoder> decoders;
- {
- // Trial and error showed that exactly this order is required.
- final String[] charsetNames = { "ISO-2022-JP", "EUC-JP", "UTF-8", "windows-31j" };
- decoders = new LinkedList<CharsetDecoder>();
- for (int i = 0; i < charsetNames.length; i++) {
- final Charset charset = Charset.forName(charsetNames[i]);
- decoders.add(charset.newDecoder());
- }
- }
- /**
- * Guesses the character encoding of a byte array by trial decoding.
- * The candidate decoders are tried in list order; the charset of the first
- * one that decodes the whole array without a CharacterCodingException wins.
- *
- * @param bytes the encoded text to examine
- * @return the charset of the first candidate decoder that accepts the bytes
- * @throws IllegalArgumentException if no candidate decoder accepts the bytes
- */
- public Charset detectEncoding(byte[] bytes) throws Exception {
- for (CharsetDecoder candidate : decoders) {
- try {
- // Only the success or failure of the decode matters; the decoded text is discarded.
- candidate.decode(ByteBuffer.wrap(bytes));
- return candidate.charset();
- } catch (CharacterCodingException rejected) {
- // This candidate cannot decode the input; fall through to the next one.
- }
- }
- throw new IllegalArgumentException("デコードできませんでした。");
- }
- public void testDetectEncoding() throws Exception {
- String[] samples = new String[] { "平", "カ", "1", "ひ", "b", };
- for (String s : samples) {
- assertEquals("windows-31j", detectEncoding(s.getBytes("sjis")).toString());
- assertEquals("UTF-8", detectEncoding(s.getBytes("utf-8")).toString());
- assertEquals("EUC-JP", detectEncoding(s.getBytes("euc_jp")).toString());
- assertEquals("ISO-2022-JP", detectEncoding(s.getBytes("jis")).toString());
- }
- }