From f52c5a06112332d3a371670d4dadacdd0e651e63 Mon Sep 17 00:00:00 2001 From: hetao Date: Wed, 19 Apr 2017 19:12:30 +0800 Subject: [PATCH 01/14] support scsw segmenter for chinese --- src/sphinx.cpp | 127 +++++++++++++++++++++++++++++++++++++++++++- src/sphinx.h | 12 +++++ src/sphinxutils.cpp | 14 +++++ src/sphinxutils.h | 1 + 4 files changed, 152 insertions(+), 2 deletions(-) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index 9f0809638..c9d6afca6 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -2575,6 +2575,25 @@ class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8 }; +/// SCWS tokenizer +template < bool IS_QUERY > +class CSphTokenizer_SCWS : public CSphTokenizerBase2 +{ +public: + CSphTokenizer_SCWS (); + ~CSphTokenizer_SCWS (); + virtual void SetBuffer ( const BYTE * sBuffer, int iLength ); + virtual BYTE * GetToken (); + virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const; + virtual int GetCodepointLength ( int iCode ) const; + virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); } + + + scws_t s; + scws_res_t cur; +}; + + struct CSphNormalForm { CSphString m_sForm; @@ -3792,6 +3811,11 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer () { return new CSphTokenizer_UTF8Ngram (); } +ISphTokenizer * sphCreateUTF8SCWSTokenizer () +{ + return new CSphTokenizer_SCWS (); +} + ///////////////////////////////////////////////////////////////////////////// @@ -4389,7 +4413,8 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett return true; tSettings.m_iType = tReader.GetByte (); - if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM ) + + if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_SCWS) { sWarning = "can't load an old index with SBCS tokenizer"; return false; @@ -4717,10 +4742,13 @@ void ISphTokenizer::Setup ( const CSphTokenizerSettings & tSettings ) ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings, const CSphEmbeddedFiles * pFiles, CSphString & sError ) { CSphScopedPtr pTokenizer ( NULL ); - + switch ( tSettings.m_iType ) { case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break; + + case TOKENIZER_SCWS: pTokenizer = sphCreateUTF8SCWSTokenizer (); break; + case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break; default: sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType ); @@ -6414,6 +6442,101 @@ BYTE * CSphTokenizer_UTF8Ngram::GetToken () return CSphTokenizer_UTF8::GetToken (); } + +///////////////////////////////////////////////////////////////////////////// + +template < bool IS_QUERY > +CSphTokenizer_SCWS::CSphTokenizer_SCWS () +{ + s = scws_new(); +} +template < bool IS_QUERY > +CSphTokenizer_SCWS::~CSphTokenizer_SCWS () +{ + scws_free_result(cur); + scws_free(s); +} + + +template < bool IS_QUERY > +void CSphTokenizer_SCWS::SetBuffer ( const BYTE * sBuffer, int iLength ) +{ + // check that old one is over and that new length is sane + assert ( iLength>=0 ); + + if ( !m_tSettings.m_scwsDict.IsEmpty () ) + { + scws_set_dict(s, m_tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM); + } + if ( !m_tSettings.m_scwsRule.IsEmpty ()) + { + scws_set_rule(s, m_tSettings.m_scwsDict.cstr ()); + } + scws_set_charset(s, "utf8"); + + + if ( m_tSettings.m_scwsMulti) + { + scws_set_multi(s, m_tSettings.m_scwsMulti << 12); + }else{ + scws_set_multi(s, 0); + } + m_pBuffer = sBuffer; + scws_send_text(s, (char*)m_pBuffer, iLength); +} + + +template < bool IS_QUERY > +BYTE * CSphTokenizer_SCWS::GetToken () +{ + if (cur == NULL) + { + cur = scws_get_result(s); + if(cur == NULL){ + return NULL; + } + } + memcpy(m_sAccum, m_pBuffer+cur->off, cur->len); + m_sAccum[cur->len]='\0'; + m_pCur += cur->off; + cur = cur->next; + return m_sAccum; + +} + +template < bool IS_QUERY > +ISphTokenizer * CSphTokenizer_SCWS::Clone ( ESphTokenizerClone eMode ) const +{ + if ( eMode!=SPH_CLONE_INDEX ) { + CSphTokenizer_SCWS *pClone = new CSphTokenizer_SCWS(); + pClone->CloneBase ( this, eMode ); + return pClone; + + } else { + CSphTokenizer_SCWS *pClone = new CSphTokenizer_SCWS(); + pClone->CloneBase ( this, eMode ); + return pClone; + } +} + + +template < bool IS_QUERY > +int CSphTokenizer_SCWS::GetCodepointLength ( int iCode ) const +{ + if ( iCode<128 ) + return 1; + + int iBytes = 0; + while ( iCode & 0x80 ) + { + iBytes++; + iCode <<= 1; + } + + assert ( iBytes>=2 && iBytes<=4 ); + return iBytes; +} + ////////////////////////////////////////////////////////////////////////// CSphMultiformTokenizer::CSphMultiformTokenizer ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer ) diff --git a/src/sphinx.h b/src/sphinx.h index 1df08bcc8..7ae826155 100644 --- a/src/sphinx.h +++ b/src/sphinx.h @@ -66,6 +66,10 @@ #include #endif +#if USE_SCWS +#include +#endif + #if USE_WINDOWS typedef __int64 SphOffset_t; #define STDOUT_FILENO fileno(stdout) @@ -498,6 +502,14 @@ struct CSphTokenizerSettings CSphString m_sNgramChars; CSphString m_sBlendChars; CSphString m_sBlendMode; + + + + CSphString m_scwsDict; + CSphString m_scwsRule; + int m_scwsMulti; + + CSphString m_sIndexingPlugin; ///< this tokenizer wants an external plugin to process its raw output CSphTokenizerSettings (); diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp index d5db1f6a0..c6dc8cbc1 100644 --- a/src/sphinxutils.cpp +++ b/src/sphinxutils.cpp @@ -592,6 +592,12 @@ static KeyDesc_t g_dKeysIndex[] = { "rlp_context", 0, NULL }, { "ondisk_attrs", 0, NULL }, { "index_token_filter", 0, NULL }, + + { "scws", 0, NULL }, + { "scws_dict", 0, NULL }, + { "scws_rule", 0, NULL }, + { "scws_multi", 0, NULL }, + { NULL, 0, NULL } }; @@ -1264,6 +1270,14 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" ); } + if ( hIndex ( "scws" ) ) + { + tSettings.m_iType = TOKENIZER_SCWS; + tSettings.m_scwsDict = hIndex.GetStr ( "scws_dict" ); + tSettings.m_scwsRule = hIndex.GetStr ( "scws_rule" ); + tSettings.m_scwsMulti = hIndex.GetInt ( "scws_multi",0 ); + } + tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" ); tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 ); tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" ); diff --git a/src/sphinxutils.h b/src/sphinxutils.h index f6757b362..b5b82e65c 100644 --- a/src/sphinxutils.h +++ b/src/sphinxutils.h @@ -147,6 +147,7 @@ enum // where was TOKENIZER_SBCS=1 once TOKENIZER_UTF8 = 2, TOKENIZER_NGRAM = 3 + ,TOKENIZER_SCWS = 4 }; /// load config file From bb75ac91e55f8c83c52daf710b20d1baa849e78d Mon Sep 17 00:00:00 2001 From: hetao Date: Wed, 19 Apr 2017 20:24:06 +0800 Subject: [PATCH 02/14] fix not support scws compile error --- configure | 48 +++++++++++++++++++++++++++++++++++++++++++++ configure.ac | 22 +++++++++++++++++++++ src/Makefile.am | 14 ++++++++++--- src/Makefile.in | 11 ++++++++--- src/sphinx.cpp | 33 +++++++++++++++++++++++-------- src/sphinx.h | 4 ++-- src/sphinxutils.cpp | 8 ++++---- src/sphinxutils.h | 2 ++ 8 files changed, 122 insertions(+), 20 deletions(-) diff --git a/configure b/configure index 10bf6a600..56e2107af 100755 --- a/configure +++ b/configure @@ -608,6 +608,8 @@ LTLIBOBJS CONFDIR USE_RLP_FALSE USE_RLP_TRUE +USE_SCWS_FALSE +USE_SCWS_TRUE USE_RE2_FALSE USE_RE2_TRUE LIBRE2_PATH @@ -653,6 +655,7 @@ DEPDIR OBJEXT EXEEXT ac_ct_CC +ac_cv_use_scws CPPFLAGS LDFLAGS CFLAGS @@ -745,6 +748,7 @@ with_re2 with_re2_includes with_re2_libs with_rlp +with_scws with_iconv with_unixodbc enable_mem_override @@ -1417,6 +1421,8 @@ Optional Packages: --with-re2-libs path to RE2 libraries --with-rlp compile with RLP library support (default is disabled) + --with-scws compile with scws library support (default is + disabled) --with-iconv compile with iconv support (default is autodetect) --with-unixodbc compile with UnixODBC support (default is autodetect) @@ -8222,6 +8228,44 @@ fi +# Check whether --with-scws was given. +if test "${with_scws+set}" = set; then : + withval=$with_scws; ac_cv_use_scws=$withval +else + ac_cv_use_scws=no + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to compile with scws library support" >&5 +$as_echo_n "checking whether to compile with scws library support... " >&6; } +if test x$ac_cv_use_scws != xno; then + if test -d $withval && test -f $withval/include/scws/scws.h; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define USE_SCWS 1" >>confdefs.h + + else + as_fn_error $? "missing SCWS sources from libscws" "$LINENO" 5 + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + +$as_echo "#define USE_SCWS 0" >>confdefs.h + +fi + if test x$ac_cv_use_scws != xno; then + USE_SCWS_TRUE= + USE_SCWS_FALSE='#' +else + USE_SCWS_TRUE='#' + USE_SCWS_FALSE= +fi + + + got_expat=0 dl_expat=0 @@ -9102,6 +9146,10 @@ if test -z "${USE_RLP_TRUE}" && test -z "${USE_RLP_FALSE}"; then as_fn_error $? "conditional \"USE_RLP\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${USE_SCWS_TRUE}" && test -z "${USE_SCWS_FALSE}"; then + as_fn_error $? "conditional \"USE_SCWS\" was never defined. +Usually this means the macro was only invoked conditionally." "$LINENO" 5 +fi : "${CONFIG_STATUS=./config.status}" ac_write_fail=0 diff --git a/configure.ac b/configure.ac index 643f5cad3..e65a01f70 100644 --- a/configure.ac +++ b/configure.ac @@ -551,6 +551,28 @@ fi AM_CONDITIONAL(USE_RLP, test x$ac_cv_use_rlp != xno) +dnl --- + +AC_ARG_WITH([scws], + AC_HELP_STRING([--with-scws], [compile with scws library support (default is disabled)]), + [ac_cv_use_scws=$withval], [ac_cv_use_scws=no] +) + +AC_MSG_CHECKING([whether to compile with scws library support]) +if test x$ac_cv_use_scws != xno; then + if test -d $ac_cv_use_scws && test -f $ac_cv_use_scws/include/scws/scws.h; then + AC_MSG_RESULT([yes]) + AC_DEFINE(USE_SCWS, 1, [scws library support]) + else + AC_MSG_ERROR([missing scws sources from libscws]) + fi +else + AC_MSG_RESULT([no]) + AC_DEFINE(USE_SCWS, 0, [scws library support]) +fi +AM_CONDITIONAL(USE_SCWS, test x$ac_cv_use_scws != xno) + + dnl --- got_expat=0 diff --git a/src/Makefile.am b/src/Makefile.am index 4312c9ed7..351ca0982 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -3,7 +3,7 @@ SRC_SPHINX = sphinx.cpp sphinxexcerpt.cpp sphinxquery.cpp \ sphinxutils.cpp sphinxstd.cpp sphinxsort.cpp sphinxexpr.cpp sphinxfilter.cpp \ sphinxsearch.cpp sphinxrt.cpp sphinxjson.cpp sphinxudf.c sphinxaot.cpp sphinxplugin.cpp -ARFLAGS = crU +ARFLAGS = cr noinst_LIBRARIES = libsphinx.a libsphinx_a_SOURCES = $(SRC_SPHINX) @@ -30,6 +30,14 @@ RLP_LIBS = RLP_INC = endif -AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\"" -COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) +if USE_SCWS +SCWS_LIBS = -L@ac_cv_use_scws@/lib/ -lscws +SCWS_INC = -I@ac_cv_use_scws@/include +else +SCWS_LIBS = +SCWS_INC = +endif + +AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) $(SCWS_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\"" +COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $S(CWS_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(SCWS_LIBS) LDADD = $(COMMON_LIBS) diff --git a/src/Makefile.in b/src/Makefile.in index 561f08696..5eda13abd 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -180,6 +180,7 @@ abs_top_builddir = @abs_top_builddir@ abs_top_srcdir = @abs_top_srcdir@ ac_ct_CC = @ac_ct_CC@ ac_ct_CXX = @ac_ct_CXX@ +#ac_cv_use_scws = @ac_cv_use_scws@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ @@ -223,7 +224,7 @@ SRC_SPHINX = sphinx.cpp sphinxexcerpt.cpp sphinxquery.cpp \ sphinxutils.cpp sphinxstd.cpp sphinxsort.cpp sphinxexpr.cpp sphinxfilter.cpp \ sphinxsearch.cpp sphinxrt.cpp sphinxjson.cpp sphinxudf.c sphinxaot.cpp sphinxplugin.cpp -ARFLAGS = crU +ARFLAGS = cr noinst_LIBRARIES = libsphinx.a libsphinx_a_SOURCES = $(SRC_SPHINX) indexer_SOURCES = indexer.cpp @@ -237,8 +238,12 @@ BUILT_SOURCES = extract-version @USE_RLP_TRUE@RLP_LIBS = -L$(top_srcdir)/rlp/lib/amd64-glibc25-gcc42 -lbtrlpc -lbtrlpcore -lbtutils @USE_RLP_FALSE@RLP_INC = @USE_RLP_TRUE@RLP_INC = -I$(top_srcdir)/rlp/rlp/include -I$(top_srcdir)/rlp/utilities/include -D_REENTRANT -AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\"" -COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) +@USE_SCWS_FALSE@SCWS_LIBS = +@USE_SCWS_TRUE@SCWS_LIBS = -L@ac_cv_use_scws@/lib/ -lscws +@USE_SCWS_FALSE@SCWS_INC = +@USE_SCWS_TRUE@SCWS_INC = -I@ac_cv_use_scws@/include +AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) $(SCWS_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\"" +COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(SCWS_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(SCWS_LIBS) LDADD = $(COMMON_LIBS) all: $(BUILT_SOURCES) $(MAKE) $(AM_MAKEFLAGS) all-am diff --git a/src/sphinx.cpp b/src/sphinx.cpp index c9d6afca6..9058ca609 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -2576,6 +2576,7 @@ class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8 /// SCWS tokenizer +#if USE_SCWS template < bool IS_QUERY > class CSphTokenizer_SCWS : public CSphTokenizerBase2 { @@ -2588,11 +2589,12 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2 virtual int GetCodepointLength ( int iCode ) const; virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); } - + scws_t s; scws_res_t cur; -}; +}; +#endif struct CSphNormalForm { @@ -3811,11 +3813,13 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer () { return new CSphTokenizer_UTF8Ngram (); } + +#if USE_SCWS ISphTokenizer * sphCreateUTF8SCWSTokenizer () { return new CSphTokenizer_SCWS (); } - +#endif ///////////////////////////////////////////////////////////////////////////// @@ -4414,8 +4418,13 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett tSettings.m_iType = tReader.GetByte (); - if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_SCWS) - { + if ( + tSettings.m_iType!=TOKENIZER_UTF8 + && tSettings.m_iType!=TOKENIZER_NGRAM +#if USE_SCWS + && tSettings.m_iType!=TOKENIZER_SCWS +#endif + ){ sWarning = "can't load an old index with SBCS tokenizer"; return false; } @@ -4746,9 +4755,9 @@ ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings, switch ( tSettings.m_iType ) { case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break; - +#if USE_SCWS case TOKENIZER_SCWS: pTokenizer = sphCreateUTF8SCWSTokenizer (); break; - +#endif case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break; default: sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType ); @@ -6445,16 +6454,24 @@ BYTE * CSphTokenizer_UTF8Ngram::GetToken () ///////////////////////////////////////////////////////////////////////////// +#if USE_SCWS + + + template < bool IS_QUERY > CSphTokenizer_SCWS::CSphTokenizer_SCWS () { + s = scws_new(); + } template < bool IS_QUERY > CSphTokenizer_SCWS::~CSphTokenizer_SCWS () { + scws_free_result(cur); scws_free(s); + } @@ -6536,7 +6553,7 @@ int CSphTokenizer_SCWS::GetCodepointLength ( int iCode ) const assert ( iBytes>=2 && iBytes<=4 ); return iBytes; } - +#endif ////////////////////////////////////////////////////////////////////////// CSphMultiformTokenizer::CSphMultiformTokenizer ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer ) diff --git a/src/sphinx.h b/src/sphinx.h index 7ae826155..efafcfc0a 100644 --- a/src/sphinx.h +++ b/src/sphinx.h @@ -504,11 +504,11 @@ struct CSphTokenizerSettings CSphString m_sBlendMode; - + #if USE_SCWS CSphString m_scwsDict; CSphString m_scwsRule; int m_scwsMulti; - +#endif CSphString m_sIndexingPlugin; ///< this tokenizer wants an external plugin to process its raw output diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp index c6dc8cbc1..caf80da2e 100644 --- a/src/sphinxutils.cpp +++ b/src/sphinxutils.cpp @@ -592,12 +592,12 @@ static KeyDesc_t g_dKeysIndex[] = { "rlp_context", 0, NULL }, { "ondisk_attrs", 0, NULL }, { "index_token_filter", 0, NULL }, - +#if USE_SCWS { "scws", 0, NULL }, { "scws_dict", 0, NULL }, { "scws_rule", 0, NULL }, { "scws_multi", 0, NULL }, - +#endif { NULL, 0, NULL } }; @@ -1269,7 +1269,7 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings else sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" ); } - +#if USE_SCWS if ( hIndex ( "scws" ) ) { tSettings.m_iType = TOKENIZER_SCWS; @@ -1277,7 +1277,7 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings tSettings.m_scwsRule = hIndex.GetStr ( "scws_rule" ); tSettings.m_scwsMulti = hIndex.GetInt ( "scws_multi",0 ); } - +#endif tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" ); tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 ); tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" ); diff --git a/src/sphinxutils.h b/src/sphinxutils.h index b5b82e65c..e92114ef3 100644 --- a/src/sphinxutils.h +++ b/src/sphinxutils.h @@ -147,7 +147,9 @@ enum // where was TOKENIZER_SBCS=1 once TOKENIZER_UTF8 = 2, TOKENIZER_NGRAM = 3 +#if USE_SCWS ,TOKENIZER_SCWS = 4 +#endif }; /// load config file From 3d99dcbbb496e53ce595f0a3ac1a8a27925b80b1 Mon Sep 17 00:00:00 2001 From: hetao Date: Wed, 19 Apr 2017 20:52:22 +0800 Subject: [PATCH 03/14] fix USE_SCWS defined miss --- config/config.h.in | 3 +++ src/Makefile.in | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/config/config.h.in b/config/config.h.in index 7eac198a4..6d53f6a65 100644 --- a/config/config.h.in +++ b/config/config.h.in @@ -288,6 +288,9 @@ /* RLP library support */ #undef USE_RLP +/* SCWS library support */ +#undef USE_SCWS + /* define to use POSIX Syslog for logging */ #undef USE_SYSLOG diff --git a/src/Makefile.in b/src/Makefile.in index 5eda13abd..83b69d8f6 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -180,7 +180,6 @@ abs_top_builddir = @abs_top_builddir@ abs_top_srcdir = @abs_top_srcdir@ ac_ct_CC = @ac_ct_CC@ ac_ct_CXX = @ac_ct_CXX@ -#ac_cv_use_scws = @ac_cv_use_scws@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ From 02aa41cf01a2b68729efb781cbe0e36d7c81fe2b Mon Sep 17 00:00:00 2001 From: hetao Date: Wed, 19 Apr 2017 21:22:39 +0800 Subject: [PATCH 04/14] fix SetCaseFolding --- src/sphinx.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index 9058ca609..11d950122 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -6462,7 +6462,10 @@ template < bool IS_QUERY > CSphTokenizer_SCWS::CSphTokenizer_SCWS () { - s = scws_new(); + CSphString sTmp; + SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp ); + m_bHasBlend = false; + s = scws_new(); } template < bool IS_QUERY > From c5d4b7e7c1c39c869d50d921a4d2a5203112adcb Mon Sep 17 00:00:00 2001 From: hetao Date: Thu, 20 Apr 2017 19:35:15 +0800 Subject: [PATCH 05/14] fix scws_set_ignore default true --- src/sphinx.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index 9058ca609..cf71ccbeb 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -6490,6 +6490,7 @@ void CSphTokenizer_SCWS::SetBuffer ( const BYTE * sBuffer, int iLength scws_set_rule(s, m_tSettings.m_scwsDict.cstr ()); } scws_set_charset(s, "utf8"); + scws_set_ignore(s, true); if ( m_tSettings.m_scwsMulti) From d5b39bd7921f618c2908d36ed9d17cd0fb3abfd5 Mon Sep 17 00:00:00 2001 From: hetao Date: Fri, 21 Apr 2017 09:55:59 +0800 Subject: [PATCH 06/14] static link --- src/Makefile.am | 2 +- src/Makefile.in | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index 351ca0982..0874cf503 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -31,7 +31,7 @@ RLP_INC = endif if USE_SCWS -SCWS_LIBS = -L@ac_cv_use_scws@/lib/ -lscws +SCWS_LIBS = @ac_cv_use_scws@/lib/libscws.a SCWS_INC = -I@ac_cv_use_scws@/include else SCWS_LIBS = diff --git a/src/Makefile.in b/src/Makefile.in index 83b69d8f6..491381fc7 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -238,7 +238,7 @@ BUILT_SOURCES = extract-version @USE_RLP_FALSE@RLP_INC = @USE_RLP_TRUE@RLP_INC = -I$(top_srcdir)/rlp/rlp/include -I$(top_srcdir)/rlp/utilities/include -D_REENTRANT @USE_SCWS_FALSE@SCWS_LIBS = -@USE_SCWS_TRUE@SCWS_LIBS = -L@ac_cv_use_scws@/lib/ -lscws +@USE_SCWS_TRUE@SCWS_LIBS = @ac_cv_use_scws@/lib/libscws.a @USE_SCWS_FALSE@SCWS_INC = @USE_SCWS_TRUE@SCWS_INC = -I@ac_cv_use_scws@/include AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) $(SCWS_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\"" From 7308c936dda88c63e5091814dca28b6b7548ccab Mon Sep 17 00:00:00 2001 From: hetao Date: Fri, 21 Apr 2017 10:56:06 +0800 Subject: [PATCH 07/14] fix set in setting --- src/sphinx.cpp | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index 97858c006..23ce7e18b 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -2586,12 +2586,32 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2 virtual void SetBuffer ( const BYTE * sBuffer, int iLength ); virtual BYTE * GetToken (); virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const; + virtual void Setup ( const CSphTokenizerSettings & tSettings ) { + CSphTokenizerBase2::Setup ( tSettings ); + if ( !m_tSettings.m_scwsDict.IsEmpty () ) + { + scws_set_dict(s, m_tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM); + } + if ( !m_tSettings.m_scwsRule.IsEmpty ()) + { + scws_set_rule(s, m_tSettings.m_scwsDict.cstr ()); + } + scws_set_charset(s, "utf8"); + scws_set_ignore(s, true); + + + if ( m_tSettings.m_scwsMulti) + { + scws_set_multi(s, m_tSettings.m_scwsMulti << 12); + }else{ + scws_set_multi(s, 0); + } + } virtual int GetCodepointLength ( int iCode ) const; virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); } - - scws_t s; - scws_res_t cur; + scws_t s; + scws_res_t cur; }; #endif @@ -6484,24 +6504,6 @@ void CSphTokenizer_SCWS::SetBuffer ( const BYTE * sBuffer, int iLength // check that old one is over and that new length is sane assert ( iLength>=0 ); - if ( !m_tSettings.m_scwsDict.IsEmpty () ) - { - scws_set_dict(s, m_tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM); - } - if ( !m_tSettings.m_scwsRule.IsEmpty ()) - { - scws_set_rule(s, m_tSettings.m_scwsDict.cstr ()); - } - scws_set_charset(s, "utf8"); - scws_set_ignore(s, true); - - - if ( m_tSettings.m_scwsMulti) - { - scws_set_multi(s, m_tSettings.m_scwsMulti << 12); - }else{ - scws_set_multi(s, 0); - } m_pBuffer = sBuffer; scws_send_text(s, (char*)m_pBuffer, iLength); } From e2484ed4cb85fd0e76ced631f50888424986ff4f Mon Sep 17 00:00:00 2001 From: hetao Date: Mon, 24 Apr 2017 12:26:58 +0800 Subject: [PATCH 08/14] fix query bug of scws gettoken --- src/sphinx.cpp | 318 +++++++++++++++++++++++++++++++++++++++++++++---- src/sphinx.h | 1 + 2 files changed, 293 insertions(+), 26 deletions(-) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index 23ce7e18b..dd3bd09b0 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -2588,30 +2588,31 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2 virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const; virtual void Setup ( const CSphTokenizerSettings & tSettings ) { CSphTokenizerBase2::Setup ( tSettings ); - if ( !m_tSettings.m_scwsDict.IsEmpty () ) + if ( !tSettings.m_scwsDict.IsEmpty () ) { - scws_set_dict(s, m_tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM); + scws_set_dict(s, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM); } - if ( !m_tSettings.m_scwsRule.IsEmpty ()) + if ( !tSettings.m_scwsRule.IsEmpty ()) { - scws_set_rule(s, m_tSettings.m_scwsDict.cstr ()); + scws_set_rule(s, tSettings.m_scwsDict.cstr ()); } scws_set_charset(s, "utf8"); scws_set_ignore(s, true); - if ( m_tSettings.m_scwsMulti) + if ( tSettings.m_scwsMulti) { - scws_set_multi(s, m_tSettings.m_scwsMulti << 12); + scws_set_multi(s, tSettings.m_scwsMulti << 12); }else{ scws_set_multi(s, 0); } } virtual int GetCodepointLength ( int iCode ) const; virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); } + const BYTE * m_pText; scws_t s; - scws_res_t cur; + scws_res_t res,cur; }; #endif @@ -4475,6 +4476,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett tSettings.m_sBlendChars = tReader.GetString (); if ( uVersion>=24 ) tSettings.m_sBlendMode = tReader.GetString(); +#if USE_SCWS + tSettings.m_scwsMulti= tReader.GetDword(); + tSettings.m_scwsDict= tReader.GetString(); + tSettings.m_scwsRule= tReader.GetString(); +#endif return true; } @@ -4504,6 +4510,11 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i tWriter.PutString ( tSettings.m_sNgramChars.cstr () ); tWriter.PutString ( tSettings.m_sBlendChars.cstr () ); tWriter.PutString ( tSettings.m_sBlendMode.cstr () ); +#if USE_SCWS + tWriter.PutDword( tSettings.m_scwsMulti); + tWriter.PutString ( tSettings.m_scwsDict.cstr()) ; + tWriter.PutString ( tSettings.m_scwsRule.cstr()); +#endif } @@ -6491,10 +6502,7 @@ CSphTokenizer_SCWS::CSphTokenizer_SCWS () template < bool IS_QUERY > CSphTokenizer_SCWS::~CSphTokenizer_SCWS () { - - scws_free_result(cur); scws_free(s); - } @@ -6503,28 +6511,285 @@ void CSphTokenizer_SCWS::SetBuffer ( const BYTE * sBuffer, int iLength { // check that old one is over and that new length is sane assert ( iLength>=0 ); + + // set buffer + m_pTokenStart = m_pTokenEnd = NULL; + m_pBlendStart = m_pBlendEnd = NULL; + + m_pText = m_pBuffer = sBuffer; + m_pBufferMax = sBuffer + iLength; + m_pCur = sBuffer; + + m_iOvershortCount = 0; + m_bBoundary = m_bTokenBoundary = false; - m_pBuffer = sBuffer; - scws_send_text(s, (char*)m_pBuffer, iLength); + res = cur = NULL; + scws_send_text(s, (char*)m_pText, iLength); } template < bool IS_QUERY > BYTE * CSphTokenizer_SCWS::GetToken () { - if (cur == NULL) - { - cur = scws_get_result(s); - if(cur == NULL){ - return NULL; - } - } - memcpy(m_sAccum, m_pBuffer+cur->off, cur->len); - m_sAccum[cur->len]='\0'; - m_pCur += cur->off; - cur = cur->next; - return m_sAccum; - + m_bWasSpecial = false; + m_bBlended = false; + m_iOvershortCount = 0; + m_bTokenBoundary = false; + m_bWasSynonym = false; + if( m_bHasBlend) + { + BYTE * pVar = GetBlendedVariant (); + if ( pVar ) + return pVar; + m_bBlendedPart = ( m_pBlendEnd!=NULL ); + } + + bool bGotNonToken = ( !IS_QUERY || m_bPhrase ); // only do this in query mode, never in indexing mode, never within phrases + bool bGotSoft = false; // hey Beavis he said soft huh huhhuh + + m_pTokenStart = NULL; + for ( ;; ) + { + // get next codepoint + const BYTE * const pCur = m_pCur; // to redo special char, if there's a token already + + if(cur !=NULL){ + memcpy(m_sAccum, m_pText + cur->off, cur->len); + m_sAccum[cur->len]='\0'; + cur = cur->next; + return m_sAccum; + } + m_pText = m_pCur; + + + int iCodePoint; + int iCode; + if ( pCuroff, cur->len); + m_sAccum[cur->len]='\0'; + + m_pTokenStart = m_pText + cur->off; + m_pCur = m_pText + cur->off + cur->len; + m_pTokenEnd = m_pCur; + + cur = cur->next; + if(cur == NULL){ + m_iLastTokenLen = 0; + m_iAccum = 0; + scws_free_result(res); + } + return m_sAccum; + } + } } template < bool IS_QUERY > @@ -6533,11 +6798,12 @@ ISphTokenizer * CSphTokenizer_SCWS::Clone ( ESphTokenizerClone eMode ) if ( eMode!=SPH_CLONE_INDEX ) { CSphTokenizer_SCWS *pClone = new CSphTokenizer_SCWS(); pClone->CloneBase ( this, eMode ); + pClone->Setup(m_tSettings); return pClone; - } else { CSphTokenizer_SCWS *pClone = new CSphTokenizer_SCWS(); pClone->CloneBase ( this, eMode ); + pClone->Setup(m_tSettings); return pClone; } } diff --git a/src/sphinx.h b/src/sphinx.h index efafcfc0a..384a6222e 100644 --- a/src/sphinx.h +++ b/src/sphinx.h @@ -455,6 +455,7 @@ class CSphLowercaser int m_iChunks; ///< how much chunks are actually allocated int * m_pData; ///< chunks themselves +public: int * m_pChunk [ CHUNK_COUNT ]; ///< pointers to non-empty chunks }; From f004fc7c1d6768c234ae2994a8d7cde71651e1f0 Mon Sep 17 00:00:00 2001 From: hetao Date: Mon, 24 Apr 2017 13:41:59 +0800 Subject: [PATCH 09/14] disable memory dict --- src/sphinx.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index dd3bd09b0..9daca7aab 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -2590,7 +2590,7 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2 CSphTokenizerBase2::Setup ( tSettings ); if ( !tSettings.m_scwsDict.IsEmpty () ) { - scws_set_dict(s, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM); + scws_set_dict(s, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB); } if ( !tSettings.m_scwsRule.IsEmpty ()) { From 8d6e8325aacf2611758f226440c871c545684fbb Mon Sep 17 00:00:00 2001 From: hetao Date: Mon, 24 Apr 2017 14:05:41 +0800 Subject: [PATCH 10/14] fix optimize performance --- src/sphinx.cpp | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index 9daca7aab..e065d57ae 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -2577,6 +2577,9 @@ class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8 /// SCWS tokenizer #if USE_SCWS +scws_t scws_global; +int scws_config_set=false; + template < bool IS_QUERY > class CSphTokenizer_SCWS : public CSphTokenizerBase2 { @@ -2586,34 +2589,36 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2 virtual void SetBuffer ( const BYTE * sBuffer, int iLength ); virtual BYTE * GetToken (); virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const; - virtual void Setup ( const CSphTokenizerSettings & tSettings ) { - CSphTokenizerBase2::Setup ( tSettings ); + virtual void Setup ( const CSphTokenizerSettings & tSettings ){ + CSphTokenizerBase2::Setup ( tSettings ); + if(scws_config_set==false){ + scws_config_set=true; if ( !tSettings.m_scwsDict.IsEmpty () ) { - scws_set_dict(s, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB); + scws_set_dict(scws_global, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM); } if ( !tSettings.m_scwsRule.IsEmpty ()) { - scws_set_rule(s, tSettings.m_scwsDict.cstr ()); + scws_set_rule(scws_global, tSettings.m_scwsDict.cstr ()); } - scws_set_charset(s, "utf8"); - scws_set_ignore(s, true); + scws_set_charset(scws_global, "utf8"); + scws_set_ignore(scws_global, true); if ( tSettings.m_scwsMulti) { - scws_set_multi(s, tSettings.m_scwsMulti << 12); + scws_set_multi(scws_global, tSettings.m_scwsMulti << 12); }else{ - scws_set_multi(s, 0); + scws_set_multi(scws_global, 0); } } + scws_source = scws_fork(scws_global); + } virtual int GetCodepointLength ( int iCode ) const; virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); } const BYTE * m_pText; - - scws_t s; - scws_res_t res,cur; - + scws_res_t res,cur; + scws_t scws_source; }; #endif @@ -6496,13 +6501,12 @@ CSphTokenizer_SCWS::CSphTokenizer_SCWS () CSphString sTmp; SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp ); m_bHasBlend = false; - s = scws_new(); - + if(scws_global==NULL) scws_global = scws_new(); } template < bool IS_QUERY > CSphTokenizer_SCWS::~CSphTokenizer_SCWS () { - scws_free(s); + scws_free(scws_source); } @@ -6524,7 +6528,7 @@ void CSphTokenizer_SCWS::SetBuffer ( const BYTE * sBuffer, int iLength m_bBoundary = m_bTokenBoundary = false; res = cur = NULL; - scws_send_text(s, (char*)m_pText, iLength); + scws_send_text(scws_source, (char*)m_pText, iLength); } @@ -6767,8 +6771,8 @@ BYTE * CSphTokenizer_SCWS::GetToken () iCode &= MASK_CODEPOINT; m_iAccum++; - scws_send_text(s, (char*)m_pText, strlen((char*)m_pText)); - res = (cur = scws_get_result(s));//只读取一个单词 + scws_send_text(scws_source, (char*)m_pText, strlen((char*)m_pText)); + res = (cur = scws_get_result(scws_source));//只读取一个单词 if(cur == NULL){ FlushAccum(); return NULL; From 6a32ad0532288ba4d8c4ab22c9e2d7d0e62976c0 Mon Sep 17 00:00:00 2001 From: hetao Date: Mon, 24 Apr 2017 18:33:53 +0800 Subject: [PATCH 11/14] ignore case with scws --- src/sphinx.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index e065d57ae..06f43b1ff 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -6560,6 +6560,7 @@ BYTE * CSphTokenizer_SCWS::GetToken () if(cur !=NULL){ memcpy(m_sAccum, m_pText + cur->off, cur->len); m_sAccum[cur->len]='\0'; + sphColumnToLowercase ( (char *)( m_sAccum ) ); cur = cur->next; return m_sAccum; } @@ -6780,6 +6781,7 @@ BYTE * CSphTokenizer_SCWS::GetToken () memcpy(m_sAccum, pCur+cur->off, cur->len); m_sAccum[cur->len]='\0'; + sphColumnToLowercase ( (char *)( m_sAccum ) ); m_pTokenStart = m_pText + cur->off; m_pCur = m_pText + cur->off + cur->len; From 5dc388c609269787569a54600b97789c013871d8 Mon Sep 17 00:00:00 2001 From: hetao Date: Thu, 27 Apr 2017 16:24:03 +0800 Subject: [PATCH 12/14] fix memory leak --- src/sphinx.cpp | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index 06f43b1ff..ac410f061 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -6501,7 +6501,9 @@ CSphTokenizer_SCWS::CSphTokenizer_SCWS () CSphString sTmp; SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp ); m_bHasBlend = false; - if(scws_global==NULL) scws_global = scws_new(); + if(scws_global==NULL) { + scws_global = scws_new(); + } } template < bool IS_QUERY > CSphTokenizer_SCWS::~CSphTokenizer_SCWS () @@ -6513,8 +6515,8 @@ CSphTokenizer_SCWS::~CSphTokenizer_SCWS () template < bool IS_QUERY > void CSphTokenizer_SCWS::SetBuffer ( const BYTE * sBuffer, int iLength ) { - // check that old one is over and that new length is sane - assert ( iLength>=0 ); + // check that old one is over and that new length is sane + assert ( iLength>=0 ); // set buffer m_pTokenStart = m_pTokenEnd = NULL; @@ -6526,9 +6528,9 @@ void CSphTokenizer_SCWS::SetBuffer ( const BYTE * sBuffer, int iLength m_iOvershortCount = 0; m_bBoundary = m_bTokenBoundary = false; - + res = cur = NULL; - scws_send_text(scws_source, (char*)m_pText, iLength); + scws_send_text(scws_source, (char*)m_pText, iLength); } @@ -6558,11 +6560,18 @@ BYTE * CSphTokenizer_SCWS::GetToken () const BYTE * const pCur = m_pCur; // to redo special char, if there's a token already if(cur !=NULL){ - memcpy(m_sAccum, m_pText + cur->off, cur->len); - m_sAccum[cur->len]='\0'; - sphColumnToLowercase ( (char *)( m_sAccum ) ); cur = cur->next; - return m_sAccum; + if(cur != NULL){ + memcpy(m_sAccum, m_pText + cur->off, cur->len); + m_sAccum[cur->len]='\0'; + sphColumnToLowercase ( (char *)( m_sAccum ) ); + return m_sAccum; + }else{ + m_iLastTokenLen = 0; + m_iAccum = 0; + scws_free_result(res); + } + } m_pText = m_pCur; @@ -6787,12 +6796,6 @@ BYTE * CSphTokenizer_SCWS::GetToken () m_pCur = m_pText + cur->off + cur->len; m_pTokenEnd = m_pCur; - cur = cur->next; - if(cur == NULL){ - m_iLastTokenLen = 0; - m_iAccum = 0; - scws_free_result(res); - } return m_sAccum; } } From da1635f7e0b3ede18d45aa9f0cef913e1658b421 Mon Sep 17 00:00:00 2001 From: hetao Date: Thu, 27 Apr 2017 18:05:48 +0800 Subject: [PATCH 13/14] increase index speed --- src/sphinx.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index ac410f061..286e35330 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -6550,6 +6550,25 @@ BYTE * CSphTokenizer_SCWS::GetToken () m_bBlendedPart = ( m_pBlendEnd!=NULL ); } + if(!IS_QUERY){ + if(cur == NULL){ + res = (cur = scws_get_result(scws_source)); + if(cur == NULL){ + return NULL; + } + } + memcpy(m_sAccum, m_pText + cur->off, cur->len); + m_sAccum[cur->len]='\0'; + sphColumnToLowercase ( (char *)( m_sAccum ) ); + m_iLastTokenLen = 0; + m_iAccum = 0; + cur = cur->next; + if(cur == NULL){ + scws_free_result(res); + } + return m_sAccum; + } + bool bGotNonToken = ( !IS_QUERY || m_bPhrase ); // only do this in query mode, never in indexing mode, never within phrases bool bGotSoft = false; // hey Beavis he said soft huh huhhuh From 84c2be81f52df3ff9e3beff7f4b3d5b08e57af7d Mon Sep 17 00:00:00 2001 From: hetao Date: Fri, 28 Apr 2017 14:22:08 +0800 Subject: [PATCH 14/14] support multi dict and xdb format --- src/sphinx.cpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/sphinx.cpp b/src/sphinx.cpp index 286e35330..da5ac169e 100644 --- a/src/sphinx.cpp +++ b/src/sphinx.cpp @@ -2595,11 +2595,27 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2 scws_config_set=true; if ( !tSettings.m_scwsDict.IsEmpty () ) { - scws_set_dict(scws_global, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM); + + int mode,ret; + + CSphVector dicts; + + sphSplit ( dicts, tSettings.m_scwsDict.cstr() ," \t,;"); + ARRAY_FOREACH ( i, dicts) + { + mode = SCWS_XDICT_MEM | SCWS_XDICT_XDB; + if (!dicts[i].Ends(".xdb")){ + mode |= SCWS_XDICT_TXT; + } + ret = scws_add_dict(scws_global, dicts[i].cstr (), mode); + sphInfo("scws set dict [%s], mode [%d], ret [%d]",dicts[i].cstr (),mode,ret); + } + } if ( !tSettings.m_scwsRule.IsEmpty ()) { - scws_set_rule(scws_global, tSettings.m_scwsDict.cstr ()); + scws_set_rule(scws_global, tSettings.m_scwsRule.cstr ()); + sphInfo("scws set rule [%s]",tSettings.m_scwsRule.cstr ()); } scws_set_charset(scws_global, "utf8"); scws_set_ignore(scws_global, true); @@ -2608,6 +2624,7 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2 if ( tSettings.m_scwsMulti) { scws_set_multi(scws_global, tSettings.m_scwsMulti << 12); + sphInfo("scws set muliti[%d]",tSettings.m_scwsMulti); }else{ scws_set_multi(scws_global, 0); }