hikozaemonchan: 6月 2009

Javaには文字エンコーディングを自動判別する機能が標準では用意されていなかったので、探し回った。
* jchardet
* juniversalchardet
などが見つかったけど、とりあえず、標準ライブラリだけで済ませる方法。

/** Candidate decoders, stored in the order in which they are tried. */
private final List<CharsetDecoder> decoders;
{
    decoders = new LinkedList<CharsetDecoder>();
    // After a lot of trial and error, this exact order turned out to be required.
    for (String charsetName : new String[] { "ISO-2022-JP", "EUC-JP", "UTF-8", "windows-31j" }) {
        Charset candidate = Charset.forName(charsetName);
        decoders.add(candidate.newDecoder());
    }
}
/**
 * Guesses the encoding of the given bytes by trial decoding.
 *
 * <p>Each decoder in {@code decoders} is tried in list order; the charset of
 * the first decoder that decodes the whole array without raising a
 * {@link CharacterCodingException} is returned.
 *
 * @param bytes the raw bytes to inspect
 * @return the charset of the first decoder that accepted {@code bytes}
 * @throws IllegalArgumentException if every decoder rejected {@code bytes}
 */
public Charset detectEncoding(byte[] bytes) throws Exception {
    for (CharsetDecoder candidate : decoders) {
        try {
            // Only success or failure matters here; the decoded text is discarded.
            candidate.decode(ByteBuffer.wrap(bytes));
            return candidate.charset();
        } catch (CharacterCodingException rejected) {
            // These bytes are not valid in this charset; fall through to the next candidate.
        }
    }
    throw new IllegalArgumentException("デコードできませんでした。");
}
/**
 * Encodes each sample character in four encodings and checks that
 * {@code detectEncoding} reports the expected charset name for each.
 */
public void testDetectEncoding() throws Exception {
    // Each row: { expected charset name, encoding name handed to String.getBytes }.
    String[][] expectations = {
        { "windows-31j", "sjis" },
        { "UTF-8", "utf-8" },
        { "EUC-JP", "euc_jp" },
        { "ISO-2022-JP", "jis" },
    };
    String[] samples = { "平", "カ", "１", "ひ", "ｂ" };
    for (String sample : samples) {
        for (String[] expectation : expectations) {
            byte[] encoded = sample.getBytes(expectation[1]);
            assertEquals(expectation[0], detectEncoding(encoded).toString());
        }
    }
}


/** Candidate decoders, stored in the order in which they are tried. */
private final List<CharsetDecoder> decoders;
{
 // After a lot of trial and error, this exact order turned out to be required.
 String[] orderedNames = { "ISO-2022-JP", "EUC-JP", "UTF-8", "windows-31j" };
 List<CharsetDecoder> built = new LinkedList<CharsetDecoder>();
 for (int i = 0; i < orderedNames.length; i++) {
  built.add(Charset.forName(orderedNames[i]).newDecoder());
 }
 decoders = built;
}
/**
 * Guesses the encoding of the given bytes by trial decoding.
 *
 * <p>Walks {@code decoders} in list order and stops at the first decoder that
 * decodes the whole array without raising a {@link CharacterCodingException}.
 *
 * @param bytes the raw bytes to inspect
 * @return the charset of the first decoder that accepted {@code bytes}
 * @throws IllegalArgumentException if every decoder rejected {@code bytes}
 */
public Charset detectEncoding(byte[] bytes) throws Exception {
 Charset detected = null;
 for (CharsetDecoder decoder : decoders) {
  try {
   // Only success or failure matters here; the decoded text is discarded.
   decoder.decode(ByteBuffer.wrap(bytes));
   detected = decoder.charset();
   break;
  } catch (CharacterCodingException notThisCharset) {
   // Rejected by this decoder; move on to the next one.
  }
 }
 if (detected == null) {
  throw new IllegalArgumentException("デコードできませんでした。");
 }
 return detected;
}

/**
 * Encodes each sample character in four encodings and checks that
 * {@code detectEncoding} reports the expected charset name for each.
 */
public void testDetectEncoding() throws Exception {
 // Parallel arrays: sourceEncodings[i] is passed to String.getBytes and
 // expectedNames[i] is the charset name the detector should report for it.
 String[] sourceEncodings = { "sjis", "utf-8", "euc_jp", "jis" };
 String[] expectedNames = { "windows-31j", "UTF-8", "EUC-JP", "ISO-2022-JP" };
 String[] samples = { "平", "カ", "１", "ひ", "ｂ" };
 for (int s = 0; s < samples.length; s++) {
  for (int e = 0; e < sourceEncodings.length; e++) {
   byte[] encoded = samples[s].getBytes(sourceEncodings[e]);
   Charset detected = detectEncoding(encoded);
   assertEquals(expectedNames[e], detected.toString());
  }
 }
}

hikozaemonchan

2009年6月19日金曜日

Java detect Encoding for Japanese

ブログアーカイブ

Blogger Syntax Highlighter

hikozaemonchan

2009年6月19日金曜日

Java detect Encoding for Japanese

ブログ アーカイブ

Blogger Syntax Highlighter

ブログアーカイブ