diff --git a/sakura/sakura.vcxproj b/sakura/sakura.vcxproj
index cf905c0f9f..a89b337a14 100644
--- a/sakura/sakura.vcxproj
+++ b/sakura/sakura.vcxproj
@@ -303,6 +303,7 @@
+
@@ -413,6 +414,7 @@
+
@@ -647,6 +649,7 @@
+
@@ -776,6 +779,7 @@
+
diff --git a/sakura/sakura.vcxproj.filters b/sakura/sakura.vcxproj.filters
index 0b21690eb5..be2661ada1 100644
--- a/sakura/sakura.vcxproj.filters
+++ b/sakura/sakura.vcxproj.filters
@@ -119,6 +119,9 @@
{930f3f82-ab3f-49e3-af4a-d4f9c2d51f46}
+
+ {e4629f85-3be8-4dda-80db-1be310929433}
+
@@ -1085,6 +1088,12 @@
Cpp Source Files\mem
+
+ Cpp Source Files\extmodule
+
+
+ Cpp Source Files\charset\icu4c
+
@@ -2252,6 +2261,12 @@
Cpp Source Files\dlg
+
+ Cpp Source Files\extmodule
+
+
+ Cpp Source Files\charset\icu4c
+
diff --git a/sakura_core/Makefile b/sakura_core/Makefile
index 5ea88e13f6..55d68a600b 100644
--- a/sakura_core/Makefile
+++ b/sakura_core/Makefile
@@ -115,6 +115,7 @@ charset/CUnicode.o \
charset/CUnicodeBe.o \
charset/CUtf7.o \
charset/CUtf8.o \
+charset/icu4c/CharsetDetector.o \
cmd/CViewCommander.o \
cmd/CViewCommander_Bookmark.o \
cmd/CViewCommander_Clipboard.o \
@@ -228,6 +229,7 @@ extmodule/CBregexp.o \
extmodule/CBregexpDll2.o \
extmodule/CDllHandler.o \
extmodule/CHtmlHelp.o \
+extmodule/CIcu4cI18n.o \
extmodule/CMigemo.o \
extmodule/CUxTheme.o \
func/CFuncKeyWnd.o \
diff --git a/sakura_core/_os/CClipboard.cpp b/sakura_core/_os/CClipboard.cpp
index a6a6e7fade..3673c49488 100644
--- a/sakura_core/_os/CClipboard.cpp
+++ b/sakura_core/_os/CClipboard.cpp
@@ -605,11 +605,7 @@ bool CClipboard::GetClipboradByFormat(CNativeW& mem, const wchar_t* pFormatName,
}else{
ECodeType eMode = (ECodeType)nMode;
if( !IsValidCodeType(eMode) ){
- // コード不明と99は自動判別
- ECodeType nBomCode = CCodeMediator::DetectUnicodeBom((const char*)pData, nLength);
- if( nBomCode != CODE_NONE ){
- eMode = nBomCode;
- }else{
+ {
const STypeConfig& type = CEditDoc::GetInstance(0)->m_cDocType.GetDocumentAttribute();
CCodeMediator mediator(type.m_encoding);
eMode = mediator.CheckKanjiCode((const char*)pData, nLength);
diff --git a/sakura_core/charset/CCodeMediator.cpp b/sakura_core/charset/CCodeMediator.cpp
index 2d1515f177..c978dd2b51 100644
--- a/sakura_core/charset/CCodeMediator.cpp
+++ b/sakura_core/charset/CCodeMediator.cpp
@@ -1,147 +1,9 @@
/*! @file */
#include "StdAfx.h"
#include "charset/CCodeMediator.h"
-#include "charset/charcode.h"
+#include "charset/icu4c/CharsetDetector.h"
#include "charset/CESI.h"
#include "io/CBinaryStream.h"
-#include "types/CType.h"
-
-/*!
- 文字列の先頭にUnicode系BOMが付いているか?
-
- @retval CODE_UNICODE UTF-16 LE
- @retval CODE_UTF8 UTF-8
- @retval CODE_UNICODEBE UTF-16 BE
- @retval CODE_NONE 未検出
-
- @date 2007.08.11 charcode.cpp から移動
-*/
-ECodeType CCodeMediator::DetectUnicodeBom( const char* pS, const int nLen )
-{
- uchar_t *pBuf;
-
- if( NULL == pS ){ return CODE_NONE; }
-
- pBuf = (uchar_t *) pS;
- if( 2 <= nLen ){
- if( pBuf[0] == 0xff && pBuf[1] == 0xfe ){
- return CODE_UNICODE;
- }
- if( pBuf[0] == 0xfe && pBuf[1] == 0xff ){
- return CODE_UNICODEBE;
- }
- if( 3 <= nLen ){
- if( pBuf[0] == 0xef && pBuf[1] == 0xbb && pBuf[2] == 0xbf ){
- return CODE_UTF8;
- }
- }
- }
-#if 0
-// 2015.03.05 Moca UTF-7 BOMは無効に変更
-// もしデータがASCII互換でUTF-7として正しければ、文字コード比較でUTF-7になるはず
- if( 4 <= nLen ){
- if( memcmp( pBuf, "+/v", 3 ) == 0
- && ( pBuf[3] == '8' || pBuf[3] == '9' || pBuf[3] == '+' || pBuf[3] == '/' ) ){
- return CODE_UTF7;
- }
- }
-#endif
- return CODE_NONE;
-}
-
-/*!
- SJIS, JIS, EUCJP, UTF-8, UTF-7 を判定 (改)
-
- @return SJIS, JIS, EUCJP, UTF-8, UTF-7 の何れかの ID を返す.
-
- @note 適切な検出が行われた場合は、m_dwStatus に CESI_MB_DETECTED フラグが格納される。
-*/
-ECodeType CCodeMediator::DetectMBCode( CESI* pcesi )
-{
-// pcesi->m_dwStatus = ESI_NOINFORMATION;
-
- if( pcesi->GetDataLen() < (pcesi->m_apMbcInfo[0]->nSpecific - pcesi->m_apMbcInfo[0]->nPoints) * 2000 ){
- // 不正バイトの割合が、全体の 0.05% 未満であることを確認。
- // 全体の0.05%ほどの不正バイトは、無視する。
- pcesi->SetStatus( ESI_NODETECTED );
- return CODE_NONE;
- }
- if( pcesi->m_apMbcInfo[0]->nPoints <= 0 ){
- pcesi->SetStatus( ESI_NODETECTED );
- return CODE_NONE;
- }
-
- /*
- 判定状況を確認
- */
- pcesi->SetStatus( ESI_MBC_DETECTED );
- return pcesi->m_apMbcInfo[0]->eCodeID;
-}
-
-/*!
- UTF-16 LE/BE を判定.
-
- @retval CODE_UNICODE UTF-16 LE が検出された
- @retval CODE_UNICODEBE UTF-16 BE が検出された
- @retval 0 UTF-16 LE/BE ともに検出されなかった
-
-*/
-ECodeType CCodeMediator::DetectUnicode( CESI* pcesi )
-{
-// pcesi->m_dwStatus = ESI_NOINFORMATION;
-
- EBOMType ebom_type = pcesi->GetBOMType();
- int ndatalen;
- int nlinebreak;
-
- if( ebom_type == ESI_BOMTYPE_UNKNOWN ){
- pcesi->SetStatus( ESI_NODETECTED );
- return CODE_NONE;
- }
-
- // 1行の平均桁数が200を超えている場合はUnicode未検出とする
- ndatalen = pcesi->GetDataLen();
- nlinebreak = pcesi->m_aWcInfo[ebom_type].nSpecific; // 改行数を nlinebreakに取得
- if( static_cast(ndatalen) / nlinebreak > 200 ){
- pcesi->SetStatus( ESI_NODETECTED );
- return CODE_NONE;
- }
-
- pcesi->SetStatus( ESI_WC_DETECTED );
- return pcesi->m_aWcInfo[ebom_type].eCodeID;
-}
-
-/*
- 日本語コードセット判定
-*/
-ECodeType CCodeMediator::CheckKanjiCode( CESI* pcesi )
-{
- ECodeType nret;
-
- /*
- 判定状況は、
- DetectMBCode(), DetectUnicode() 内で
- cesi.m_dwStatus に記録する。
- */
-
- if( pcesi == NULL ){
- return CODE_DEFAULT;
- }
- if( pcesi->GetMetaName() != CODE_NONE ){
- return pcesi->GetMetaName();
- }
- nret = DetectUnicode( pcesi );
- if( nret != CODE_NONE && pcesi->GetStatus() != ESI_NODETECTED ){
- return nret;
- }
- nret = DetectMBCode( pcesi );
- if( nret != CODE_NONE && pcesi->GetStatus() != ESI_NODETECTED ){
- return nret;
- }
-
- // デフォルト文字コードを返す
- return pcesi->m_pEncodingConfig->m_eDefaultCodetype;
-}
/*
日本語コードセット判別
@@ -155,18 +17,22 @@ ECodeType CCodeMediator::CheckKanjiCode( CESI* pcesi )
UTF-7 CODE_UTF7
UnicodeBE CODE_UNICODEBE
*/
-ECodeType CCodeMediator::CheckKanjiCode( const char* pBuf, int nBufLen )
+ECodeType CCodeMediator::CheckKanjiCode(const char* buff, size_t size) noexcept
{
- CESI cesi(*m_pEncodingConfig);
+ // Empty input: return the per-type default code type.
+ if (size == 0) {
+ return m_sEncodingConfig.m_eDefaultCodetype;
+ }
- /*
- 判定状況は、
- DetectMBCode(), DetectUnicode() 内で
- cesi.m_dwStatus に記録する。
- */
+ // When the ICU4C DLLs are available, try detection with ICU4C first.
+ // A CODE_ERROR result from the detector falls through to the CESI path below.
+ CharsetDetector csd;
+ if (csd.IsAvailable()) {
+ auto code = csd.Detect(std::string_view(buff, size));
+ if (code != CODE_ERROR) return code;
+ }
- cesi.SetInformation( pBuf, nBufLen/*, CODE_SJIS*/ );
- return CheckKanjiCode( &cesi );
+ // Fallback: built-in detection by CESI, using this mediator's encoding config.
+ CESI cesi(m_sEncodingConfig);
+ return cesi.CheckKanjiCode(buff, size);
}
/*
@@ -182,8 +48,12 @@ ECodeType CCodeMediator::CheckKanjiCode( const char* pBuf, int nBufLen )
|| UnicodeBE CODE_UNICODEBE
|| エラー CODE_ERROR
*/
-ECodeType CCodeMediator::CheckKanjiCodeOfFile( const WCHAR* pszFile )
+ECodeType CCodeMediator::CheckKanjiCodeOfFile(const WCHAR* pszFile)
{
+ if (!pszFile) {
+ return CODE_ERROR;
+ }
+
// オープン
CBinaryInputStream in(pszFile);
if(!in){
@@ -191,33 +61,21 @@ ECodeType CCodeMediator::CheckKanjiCodeOfFile( const WCHAR* pszFile )
}
// データ長取得
- int nBufLen = in.GetLength();
- if( nBufLen > CheckKanjiCode_MAXREADLENGTH ){
- nBufLen = CheckKanjiCode_MAXREADLENGTH;
- }
-
- // 0バイトならタイプ別のデフォルト設定
- if( 0 == nBufLen ){
- return m_pEncodingConfig->m_eDefaultCodetype;
- }
+    // Limit the amount of data inspected to CheckKanjiCode_MAXREADLENGTH.
+    auto size = std::min(in.GetLength(), CheckKanjiCode_MAXREADLENGTH);
- // データ確保
- CMemory cMem;
- cMem.AllocBuffer(nBufLen);
- void* pBuf = cMem.GetRawPtr();
+    std::unique_ptr<char[]> buff;
+    if (size > 0)
+    {
+        // Allocate the read buffer.
+        buff = std::make_unique<char[]>(size);
- // 読み込み
- nBufLen = in.Read(pBuf, nBufLen);
+        // Read the data and keep the byte count actually read, so that a
+        // short read does not hand the unread tail of the buffer to detection.
+        size = in.Read(buff.get(), size);
+    }
// クローズ
in.Close();
// 日本語コードセット判別
- ECodeType nCodeType = DetectUnicodeBom( reinterpret_cast(pBuf), nBufLen );
- if( nCodeType == CODE_NONE ){
- // Unicode BOM は検出されませんでした.
- nCodeType = CheckKanjiCode( reinterpret_cast(pBuf), nBufLen );
- }
-
- return nCodeType;
+ return CheckKanjiCode(buff.get(), size);
}
diff --git a/sakura_core/charset/CCodeMediator.h b/sakura_core/charset/CCodeMediator.h
index 48348d9c0d..6d7af2702b 100644
--- a/sakura_core/charset/CCodeMediator.h
+++ b/sakura_core/charset/CCodeMediator.h
@@ -24,30 +24,27 @@
*/
#pragma once
-#include "charset/CESI.h"
-class CEditDoc;
-
-class CCodeMediator{
-protected:
- // CESI.cpp の判定関数をここに移す
- static ECodeType DetectMBCode( CESI* pcesi );
- static ECodeType DetectUnicode( CESI* pcesi );
-
+#include "types/CType.h" //SEncodingConfig
+
+/*!
+ * @brief CCodeMediator class
+ *
+ * Mediator class that hides the details of Japanese code set detection.
+ *
+ * Only a reference to the encoding config is stored, so the referenced
+ * object must outlive this mediator.
+ */
+class CCodeMediator final {
public:
-
- explicit CCodeMediator( const SEncodingConfig &ref ) : m_pEncodingConfig(&ref) { }
-
- static ECodeType DetectUnicodeBom( const char* pS, const int nLen );
+ // Binds the mediator to the given encoding config (not copied).
+ explicit CCodeMediator(const SEncodingConfig &encodingConfig) noexcept
+ : m_sEncodingConfig(encodingConfig)
+ {
+ }
/* 日本語コードセット判別 */
- ECodeType CheckKanjiCode( const char* pBuf, int nBufLen );
+ ECodeType CheckKanjiCode(const char* buff, size_t size) noexcept;
/* ファイルの日本語コードセット判別 */
- ECodeType CheckKanjiCodeOfFile( const WCHAR* pszFile );
-
- static ECodeType CheckKanjiCode( CESI* pcesi ); // CESI 構造体(?)を外部で構築した場合に使用
+ ECodeType CheckKanjiCodeOfFile(const WCHAR* pszFile);
private:
- const SEncodingConfig* m_pEncodingConfig;
+ // Encoding config referenced by this mediator.
+ const SEncodingConfig& m_sEncodingConfig;
};
/*[EOF]*/
diff --git a/sakura_core/charset/CESI.cpp b/sakura_core/charset/CESI.cpp
index 1bd905c234..79d73c04a7 100644
--- a/sakura_core/charset/CESI.cpp
+++ b/sakura_core/charset/CESI.cpp
@@ -1134,6 +1134,103 @@ ECodeType CESI::AutoDetectByCoding( const char* pBuf, int nSize )
return CODE_NONE;
}
+/*!
+ Detects SJIS, JIS, EUCJP, UTF-8 or UTF-7 (revised).
+
+ @return The code ID of the first entry in pcesi->m_apMbcInfo, or CODE_NONE when nothing was detected.
+
+ @note The status of pcesi is set to ESI_MBC_DETECTED on success and to ESI_NODETECTED otherwise.
+*/
+static ECodeType DetectMBCode( CESI* pcesi )
+{
+// pcesi->m_dwStatus = ESI_NOINFORMATION;
+
+ if( pcesi->GetDataLen() < (pcesi->m_apMbcInfo[0]->nSpecific - pcesi->m_apMbcInfo[0]->nPoints) * 2000 ){
+ // Per the original note: make sure invalid bytes are less than 0.05% of the whole data.
+ // About 0.05% of invalid bytes is tolerated.
+ pcesi->SetStatus( ESI_NODETECTED );
+ return CODE_NONE;
+ }
+ // A non-positive score for the first candidate means no detection.
+ if( pcesi->m_apMbcInfo[0]->nPoints <= 0 ){
+ pcesi->SetStatus( ESI_NODETECTED );
+ return CODE_NONE;
+ }
+
+ /*
+ Record the detection status
+ */
+ pcesi->SetStatus( ESI_MBC_DETECTED );
+ return pcesi->m_apMbcInfo[0]->eCodeID;
+}
+
+/*!
+    Detects UTF-16 LE/BE from the information already gathered in a CESI object.
+
+    @param pcesi  analysis object; its status is set to ESI_WC_DETECTED on
+                  success and to ESI_NODETECTED otherwise
+
+    @retval CODE_UNICODE    UTF-16 LE was detected
+    @retval CODE_UNICODEBE  UTF-16 BE was detected
+    @retval CODE_NONE       neither UTF-16 LE nor BE was detected
+*/
+static ECodeType DetectUnicode( CESI* pcesi )
+{
+    const EBOMType ebom_type = pcesi->GetBOMType();
+    if( ebom_type == ESI_BOMTYPE_UNKNOWN ){
+        pcesi->SetStatus( ESI_NODETECTED );
+        return CODE_NONE;
+    }
+
+    // Treat the data as "not Unicode" when the average line length exceeds 200.
+    const int ndatalen = pcesi->GetDataLen();
+    const int nlinebreak = pcesi->m_aWcInfo[ebom_type].nSpecific;   // number of line breaks
+
+    // NOTE(review): the cast's target type was lost in the extracted patch;
+    // double is assumed. The division is done in floating point so that
+    // nlinebreak == 0 does not cause an integer division by zero.
+    if( static_cast<double>(ndatalen) / nlinebreak > 200 ){
+        pcesi->SetStatus( ESI_NODETECTED );
+        return CODE_NONE;
+    }
+
+    pcesi->SetStatus( ESI_WC_DETECTED );
+    return pcesi->m_aWcInfo[ebom_type].eCodeID;
+}
+
+/*
+    Japanese code set detection.
+
+    Order of evaluation: Unicode BOM, meta name, UTF-16 detection, multi-byte
+    detection, and finally the default code type of the encoding config.
+*/
+ECodeType CESI::CheckKanjiCode(const char* pBuf, size_t nBufLen) noexcept
+{
+    // A Unicode BOM decides the result immediately.
+    const ECodeType eBomCode = DetectUnicodeBom(pBuf, nBufLen);
+    if (eBomCode != CODE_NONE) {
+        return eBomCode;
+    }
+
+    // Gather the information used below. DetectUnicode() and DetectMBCode()
+    // record their outcome in this object's status.
+    SetInformation(pBuf, nBufLen);
+
+    if (GetMetaName() != CODE_NONE) {
+        return GetMetaName();
+    }
+
+    // Try the detectors in order; the first one that reports a detection wins.
+    using Detector = ECodeType (*)(CESI*);
+    const Detector detectors[] = { &DetectUnicode, &DetectMBCode };
+    for (const Detector detect : detectors) {
+        const ECodeType eDetected = detect(this);
+        if (eDetected != CODE_NONE && GetStatus() != ESI_NODETECTED) {
+            return eDetected;
+        }
+    }
+
+    // Nothing detected: return the default character code.
+    return m_pEncodingConfig->m_eDefaultCodetype;
+}
+
#ifdef _DEBUG
/*!
@@ -1152,8 +1249,8 @@ void CESI::GetDebugInfo( const char* pS, const int nLen, CNativeW* pcmtxtOut )
CESI cesi( doc.m_cDocType.GetDocumentAttribute().m_encoding );
// テスト実行
- cesi.SetInformation( pS, nLen/*, CODE_SJIS*/ );
- ecode_result = CCodeMediator::CheckKanjiCode( &cesi );
+ // Keep the detected code; it must not be overwritten before it is analyzed.
+ ecode_result = cesi.CheckKanjiCode(pS, nLen);
//
// 判別結果を分析
diff --git a/sakura_core/charset/CESI.h b/sakura_core/charset/CESI.h
index 664100704d..3cbb808d49 100644
--- a/sakura_core/charset/CESI.h
+++ b/sakura_core/charset/CESI.h
@@ -91,10 +91,14 @@ class CESI {
m_eMetaName = CODE_NONE;
}
- //! 調査結果の情報を格納
- void SetInformation( const char *pS, const int nLen );
+ //! 日本語コードセット判定
+ ECodeType CheckKanjiCode(const char* buff, size_t size) noexcept;
protected:
+ ECodeType DetectUnicodeBom(const char* pS, size_t nLen) noexcept;
+
+ //! 調査結果の情報を格納
+ void SetInformation( const char *pS, const int nLen );
//! 添え字に使われる優先順位表を作成
void InitPriorityTable( void );
@@ -216,4 +220,49 @@ class CESI {
#endif
};
+/*!
+    Checks whether the byte sequence starts with a Unicode BOM.
+
+    @param buff  start of the byte sequence (may be null)
+    @param size  number of bytes available at buff
+
+    @retval CODE_UNICODE    UTF-16 LE
+    @retval CODE_UTF8       UTF-8
+    @retval CODE_UNICODEBE  UTF-16 BE
+    @retval CODE_NONE       not detected
+
+    @date 2007.08.11 moved from charcode.cpp
+    @date 2015.03.05 Moca: the UTF-7 BOM was disabled
+ */
+inline
+ECodeType CESI::DetectUnicodeBom(const char* buff, size_t size) noexcept
+{
+    // No data, or too short to hold even the shortest BOM: skip the check.
+    if (buff == nullptr || size < 2) {
+        return CODE_NONE;
+    }
+
+    const auto* head = reinterpret_cast<const BYTE*>(buff);
+
+    // UTF-8 representation of U+FEFF (needs three bytes).
+    if (size >= 3 && head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf) {
+        return CODE_UTF8;
+    }
+
+    // UTF-16BE representation of U+FEFF.
+    if (head[0] == 0xfe && head[1] == 0xff) {
+        return CODE_UNICODEBE;
+    }
+
+    // UTF-16LE representation of U+FEFF.
+    if (head[0] == 0xff && head[1] == 0xfe) {
+        return CODE_UNICODE;
+    }
+
+    // UTF-7 encodes non-7-bit-ASCII characters as UTF-16BE and then as modified Base64.
+    // Base64 works in 6-bit units, so when the character after the BOM is not
+    // a 7-bit character the fourth byte varies.
+    // For that reason UTF-7 is not covered by this BOM shortcut.
+    //
+    // (BOM)abc ⇒ (UTF-7) ⇒ +/v8-abc
+    // (BOM)アイウ ⇒ (UTF-7) ⇒ +/v//cf9y/3M-
+
+    return CODE_NONE;
+}
+
/*[EOF]*/
diff --git a/sakura_core/charset/icu4c/CharsetDetector.cpp b/sakura_core/charset/icu4c/CharsetDetector.cpp
new file mode 100644
index 0000000000..0979e4b91f
--- /dev/null
+++ b/sakura_core/charset/icu4c/CharsetDetector.cpp
@@ -0,0 +1,77 @@
+/*! @file */
+/*
+ Copyright (C) 2018-2019 Sakura Editor Organization
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented;
+ you must not claim that you wrote the original software.
+ If you use this software in a product, an acknowledgment
+ in the product documentation would be appreciated but is
+ not required.
+
+ 2. Altered source versions must be plainly marked as such,
+ and must not be misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source
+ distribution.
+*/
+#include "StdAfx.h"
+#include "CharsetDetector.h"
+
+// Creates a detector with no ICU handle yet and asks the DLL wrapper to
+// initialize itself. The return value of InitDll() is not examined here.
+CharsetDetector::CharsetDetector() noexcept
+    : _icuin{}
+    , _csd{ nullptr }
+{
+    _icuin.InitDll();
+}
+
+// Closes the ICU detector handle, but only when the DLL wrapper reports
+// that it is available.
+CharsetDetector::~CharsetDetector() noexcept
+{
+    if (!_icuin.IsAvailable()) {
+        return;
+    }
+    _icuin.ucsdet_close(_csd);
+}
+
+/*!
+ * @brief Detects the character set of a byte sequence with ICU4C.
+ *
+ * @param bytes  the bytes to examine
+ * @return the matching editor code type, or CODE_ERROR when an ICU call
+ *         fails, no match is found, or the charset name is not one of
+ *         those mapped below
+ */
+ECodeType CharsetDetector::Detect(const std::string_view& bytes)
+{
+    UErrorCode status = U_ZERO_ERROR;
+
+    // Open the ICU detector once and reuse it on later calls; opening a new
+    // one on every call would leak the previous handle, because the
+    // destructor closes only the last one.
+    if (_csd == nullptr) {
+        _csd = _icuin.ucsdet_open(&status);
+        if (status != U_ZERO_ERROR || _csd == nullptr) {
+            return CODE_ERROR;
+        }
+    }
+
+    // ucsdet_setText() takes an int32_t length: clamp instead of letting a
+    // large size_t value wrap around.
+    constexpr size_t maxLen = 0x7fffffff;
+    const auto len = static_cast<int32_t>(bytes.length() < maxLen ? bytes.length() : maxLen);
+    _icuin.ucsdet_setText(_csd, bytes.data(), len, &status);
+    if (status != U_ZERO_ERROR) {
+        return CODE_ERROR;
+    }
+
+    // A null match means that no charset matched the input.
+    const auto csm = _icuin.ucsdet_detect(_csd, &status);
+    if (status != U_ZERO_ERROR || csm == nullptr) {
+        return CODE_ERROR;
+    }
+
+    // Check the raw pointer before building a string_view from it:
+    // constructing a string_view from a null pointer is undefined behavior.
+    const char* pszName = _icuin.ucsdet_getName(csm, &status);
+    if (status != U_ZERO_ERROR || pszName == nullptr) {
+        return CODE_ERROR;
+    }
+    const std::string_view name(pszName);
+
+    // Map the charset name to the editor's internal code.
+    if (name == "UTF-8") return CODE_UTF8;
+    // ICU documents this name as "Shift_JIS"; the upper-case spelling is
+    // still accepted for compatibility.
+    if (name == "Shift_JIS" || name == "SHIFT_JIS") return CODE_SJIS;
+    if (name == "UTF-16BE") return CODE_UNICODEBE;
+    if (name == "UTF-16LE") return CODE_UNICODE;
+    if (name == "EUC-JP") return CODE_EUC;
+    if (name == "ISO-2022-JP") return CODE_JIS;
+    if (name == "UTF-7") return CODE_UTF7;
+    if (name == "ISO-8859-1") return CODE_LATIN1;
+
+    return CODE_ERROR;
+}
diff --git a/sakura_core/charset/icu4c/CharsetDetector.h b/sakura_core/charset/icu4c/CharsetDetector.h
new file mode 100644
index 0000000000..e43915a4d0
--- /dev/null
+++ b/sakura_core/charset/icu4c/CharsetDetector.h
@@ -0,0 +1,48 @@
+/*! @file */
+/*
+ Copyright (C) 2018-2019 Sakura Editor Organization
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented;
+ you must not claim that you wrote the original software.
+ If you use this software in a product, an acknowledgment
+ in the product documentation would be appreciated but is
+ not required.
+
+ 2. Altered source versions must be plainly marked as such,
+ and must not be misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source
+ distribution.
+*/
+#pragma once
+
+#include <string_view>
+
+#include "extmodule/CIcu4cI18n.h"
+
+/*!
+ * @brief Character set detection class.
+ *
+ * Owns a wrapper around the ICU4C i18n DLL and a raw ICU detector handle.
+ * The handle is closed in the destructor, so the class is made non-copyable
+ * to keep a copy from closing the same handle a second time.
+ */
+class CharsetDetector final
+{
+    CIcu4cI18n _icuin;          // wrapper around the ICU4C i18n DLL
+    UCharsetDetector* _csd;     // ICU detector handle (null until opened)
+
+public:
+    CharsetDetector() noexcept;
+    ~CharsetDetector() noexcept;
+
+    CharsetDetector(const CharsetDetector&) = delete;
+    CharsetDetector& operator=(const CharsetDetector&) = delete;
+
+    // True when the DLL wrapper reports that it is available.
+    bool IsAvailable() const noexcept {
+        return _icuin.IsAvailable();
+    }
+
+    // Detects the character set of the given bytes; see the definition.
+    ECodeType Detect(const std::string_view& bytes);
+};
diff --git a/sakura_core/extmodule/CIcu4cI18n.cpp b/sakura_core/extmodule/CIcu4cI18n.cpp
new file mode 100644
index 0000000000..8dd3add3ec
--- /dev/null
+++ b/sakura_core/extmodule/CIcu4cI18n.cpp
@@ -0,0 +1,69 @@
+/*! @file */
+/*
+ Copyright (C) 2018-2019 Sakura Editor Organization
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented;
+ you must not claim that you wrote the original software.
+ If you use this software in a product, an acknowledgment
+ in the product documentation would be appreciated but is
+ not required.
+
+ 2. Altered source versions must be plainly marked as such,
+ and must not be misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source
+ distribution.
+*/
+#include "StdAfx.h"
+#include "CIcu4cI18n.h"
+
+// Starts with every imported function pointer set to null, in declaration
+// order. _ucsdet_getName is included so that no pointer is left
+// uninitialized.
+CIcu4cI18n::CIcu4cI18n() noexcept
+    : _ucsdet_open(nullptr)
+    , _ucsdet_setText(nullptr)
+    , _ucsdet_detect(nullptr)
+    , _ucsdet_getName(nullptr)
+    , _ucsdet_close(nullptr)
+{
+}
+
+// Empty destructor body: this class performs no cleanup of its own here.
+CIcu4cI18n::~CIcu4cI18n() noexcept
+{
+}
+
+/*!
+ * @brief Returns the name of the DLL.
+ *
+ * @param index  unused; the same name is returned for every index
+ * @return the file name of the ICU4C i18n DLL
+ */
+LPCWSTR CIcu4cI18n::GetDllNameImp(int index)
+{
+    (void)index;    // intentionally unused
+    return L"icuin66.dll";  // the version is fixed
+}
+
+/*!
+ Initializes the DLL.
+
+ Obtains the function addresses and stores them in the members.
+
+ @retval true success
+ @retval false failed to obtain an address
+*/
+bool CIcu4cI18n::InitDllImp()
+{
+ // List of function names inside the DLL; each entry pairs a member with
+ // an export name, and the list ends with a { NULL, 0 } entry.
+ const ImportTable table[] = {
+ { &_ucsdet_open, "ucsdet_open_66" }, // the version is fixed
+ { &_ucsdet_setText, "ucsdet_setText_66" }, // the version is fixed
+ { &_ucsdet_detect, "ucsdet_detect_66" }, // the version is fixed
+ { &_ucsdet_getName, "ucsdet_getName_66" }, // the version is fixed
+ { &_ucsdet_close, "ucsdet_close_66" }, // the version is fixed
+ { NULL, 0 }
+ };
+ return RegisterEntries(table);
+}
diff --git a/sakura_core/extmodule/CIcu4cI18n.h b/sakura_core/extmodule/CIcu4cI18n.h
new file mode 100644
index 0000000000..c05f0d6486
--- /dev/null
+++ b/sakura_core/extmodule/CIcu4cI18n.h
@@ -0,0 +1,81 @@
+/*! @file */
+/*
+ Copyright (C) 2018-2019 Sakura Editor Organization
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented;
+ you must not claim that you wrote the original software.
+ If you use this software in a product, an acknowledgment
+ in the product documentation would be appreciated but is
+ not required.
+
+ 2. Altered source versions must be plainly marked as such,
+ and must not be misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source
+ distribution.
+*/
+#pragma once
+
+#include "CDllHandler.h"
+
+// ICU4C type definitions.
+// The two classes are only forward-declared here; this header uses them
+// through pointers only.
+class UCharsetDetector;
+class UCharsetMatch;
+
+// Only the success value is declared here.
+typedef enum UErrorCode {
+ U_ZERO_ERROR = 0, /**< No error, no warning. */
+} UErrorCode;
+
+/*!
+ * Class that wraps the ICU4C i18n library (icuin.dll).
+ */
+class CIcu4cI18n final : public CDllImp
+{
+ // Function-pointer types for the DLL functions
+ typedef UCharsetDetector* (__cdecl *ucsdet_open_t)(UErrorCode *status);
+ typedef void (__cdecl *ucsdet_setText_t)(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
+ typedef const UCharsetMatch * (__cdecl *ucsdet_detect_t)(UCharsetDetector *ucsd, UErrorCode *status);
+ typedef const char* (__cdecl *ucsdet_getName_t)(const UCharsetMatch *ucsm, UErrorCode *status);
+ typedef void (__cdecl *ucsdet_close_t)(UCharsetDetector *ucsd);
+
+ // Members: one function pointer per imported DLL function
+ ucsdet_open_t _ucsdet_open;
+ ucsdet_setText_t _ucsdet_setText;
+ ucsdet_detect_t _ucsdet_detect;
+ ucsdet_getName_t _ucsdet_getName;
+ ucsdet_close_t _ucsdet_close;
+
+public:
+ CIcu4cI18n() noexcept;
+ virtual ~CIcu4cI18n() noexcept;
+
+protected:
+ // CDllImp interface
+ LPCWSTR GetDllNameImp(int nIndex) override;
+ bool InitDllImp() override;
+
+public:
+ // Thin forwarding wrappers: each one calls through the corresponding
+ // function pointer without checking it for null.
+ inline UCharsetDetector* ucsdet_open(UErrorCode *status) const {
+ return _ucsdet_open(status);
+ }
+ inline void ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status) const {
+ return _ucsdet_setText(ucsd, textIn, len, status);
+ }
+ inline const UCharsetMatch* ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status) const {
+ return _ucsdet_detect(ucsd, status);
+ }
+ inline const char* ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status) const {
+ return _ucsdet_getName(ucsm, status);
+ }
+ inline void ucsdet_close(UCharsetDetector *ucsd) const {
+ return _ucsdet_close(ucsd);
+ }
+};
diff --git a/sakura_core/io/CFileLoad.cpp b/sakura_core/io/CFileLoad.cpp
index 45b81065bc..f30383c170 100644
--- a/sakura_core/io/CFileLoad.cpp
+++ b/sakura_core/io/CFileLoad.cpp
@@ -157,7 +157,6 @@ ECodeType CFileLoad::FileOpen( LPCWSTR pFileName, bool bBigFile, ECodeType CharC
{
HANDLE hFile;
ULARGE_INTEGER fileSize;
- ECodeType nBomCode;
// FileCloseを呼んでからにしてください
if( NULL != m_hFile ){
@@ -203,14 +202,9 @@ ECodeType CFileLoad::FileOpen( LPCWSTR pFileName, bool bBigFile, ECodeType CharC
// データ読み込み
Buffering();
- nBomCode = CCodeMediator::DetectUnicodeBom( m_pReadBuf, m_nReadDataLen );
if( CharCode == CODE_AUTODETECT ){
- if( nBomCode != CODE_NONE ){
- CharCode = nBomCode;
- }else{
- CCodeMediator mediator(*m_pEencoding);
- CharCode = mediator.CheckKanjiCode( m_pReadBuf, m_nReadDataLen );
- }
+ CCodeMediator mediator(*m_pEencoding);
+ CharCode = mediator.CheckKanjiCode(m_pReadBuf, m_nReadDataLen);
}
// To Here Jun. 08, 2003
// 不正な文字コードのときはデフォルト(SJIS:無変換)を設定