From f52c5a06112332d3a371670d4dadacdd0e651e63 Mon Sep 17 00:00:00 2001
From: hetao <hetao@hetao.name>
Date: Wed, 19 Apr 2017 19:12:30 +0800
Subject: [PATCH 01/14] support SCWS segmenter for Chinese

---
 src/sphinx.cpp      | 127 +++++++++++++++++++++++++++++++++++++++++++-
 src/sphinx.h        |  12 +++++
 src/sphinxutils.cpp |  14 +++++
 src/sphinxutils.h   |   1 +
 4 files changed, 152 insertions(+), 2 deletions(-)
diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 9f0809638..c9d6afca6 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -2575,6 +2575,25 @@ class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8<IS_QUERY>
 };
 
 
+/// SCWS tokenizer
+template < bool IS_QUERY >
+class CSphTokenizer_SCWS : public CSphTokenizerBase2
+{
+public:
+                                                    CSphTokenizer_SCWS ();
+                                                    ~CSphTokenizer_SCWS ();
+        virtual void                SetBuffer ( const BYTE * sBuffer, int iLength );
+        virtual BYTE *              GetToken ();
+        virtual ISphTokenizer *     Clone ( ESphTokenizerClone eMode ) const;
+        virtual int                 GetCodepointLength ( int iCode ) const;
+        virtual int                 GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
+
+      
+    scws_t s; 
+    scws_res_t cur;
+};
+
+
 struct CSphNormalForm
 {
 	CSphString				m_sForm;
@@ -3792,6 +3811,11 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer ()
 {
 	return new CSphTokenizer_UTF8Ngram<false> ();
 }
+ISphTokenizer * sphCreateUTF8SCWSTokenizer ()
+{
+        return new CSphTokenizer_SCWS<false> ();
+}
+
 
 /////////////////////////////////////////////////////////////////////////////
 
@@ -4389,7 +4413,8 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
 		return true;
 
 	tSettings.m_iType = tReader.GetByte ();
-	if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM )
+
+        if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_SCWS)
 	{
 		sWarning = "can't load an old index with SBCS tokenizer";
 		return false;
@@ -4717,10 +4742,13 @@ void ISphTokenizer::Setup ( const CSphTokenizerSettings & tSettings )
 ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings, const CSphEmbeddedFiles * pFiles, CSphString & sError )
 {
 	CSphScopedPtr<ISphTokenizer> pTokenizer ( NULL );
-
+        
 	switch ( tSettings.m_iType )
 	{
 		case TOKENIZER_UTF8:	pTokenizer = sphCreateUTF8Tokenizer (); break;
+
+		case TOKENIZER_SCWS:	pTokenizer = sphCreateUTF8SCWSTokenizer (); break;
+                
 		case TOKENIZER_NGRAM:	pTokenizer = sphCreateUTF8NgramTokenizer (); break;
 		default:
 			sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
@@ -6414,6 +6442,101 @@ BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken ()
 	return CSphTokenizer_UTF8<IS_QUERY>::GetToken ();
 }
 
+
+/////////////////////////////////////////////////////////////////////////////
+
+template < bool IS_QUERY >
+CSphTokenizer_SCWS<IS_QUERY>::CSphTokenizer_SCWS ()
+{
+        s = scws_new();
+}
+template < bool IS_QUERY >
+CSphTokenizer_SCWS<IS_QUERY>::~CSphTokenizer_SCWS ()
+{
+        scws_free_result(cur);
+        scws_free(s);
+}
+
+
+template < bool IS_QUERY >
+void CSphTokenizer_SCWS<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength )
+{
+        // check that old one is over and that new length is sane
+        assert ( iLength>=0 );
+        
+        if ( !m_tSettings.m_scwsDict.IsEmpty ()  )
+	{ 
+            scws_set_dict(s, m_tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM);
+	}
+        if ( !m_tSettings.m_scwsRule.IsEmpty ())
+	{ 
+            scws_set_rule(s, m_tSettings.m_scwsDict.cstr ());
+	}
+        scws_set_charset(s, "utf8");
+        
+        
+        if ( m_tSettings.m_scwsMulti)
+	{ 
+            scws_set_multi(s, m_tSettings.m_scwsMulti << 12);
+	}else{
+            scws_set_multi(s, 0);
+        }
+        m_pBuffer = sBuffer;
+        scws_send_text(s, (char*)m_pBuffer, iLength);
+}
+
+
+template < bool IS_QUERY >
+BYTE * CSphTokenizer_SCWS<IS_QUERY>::GetToken ()
+{
+        if (cur == NULL)
+        {
+            cur = scws_get_result(s);
+            if(cur == NULL){
+                return NULL;
+            }
+        }
+        memcpy(m_sAccum, m_pBuffer+cur->off, cur->len);
+        m_sAccum[cur->len]='\0';
+        m_pCur += cur->off;
+        cur = cur->next;
+        return m_sAccum;
+        
+}
+
+template < bool IS_QUERY >
+ISphTokenizer * CSphTokenizer_SCWS<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const
+{
+        if ( eMode!=SPH_CLONE_INDEX ) {
+                CSphTokenizer_SCWS<true> *pClone = new CSphTokenizer_SCWS<true>();
+                pClone->CloneBase ( this, eMode );
+                return pClone;
+
+        } else {
+                CSphTokenizer_SCWS<false> *pClone = new CSphTokenizer_SCWS<false>();
+                pClone->CloneBase ( this, eMode );
+                return pClone;
+        }
+}
+
+
+template < bool IS_QUERY >
+int CSphTokenizer_SCWS<IS_QUERY>::GetCodepointLength ( int iCode ) const
+{
+        if ( iCode<128 )
+                return 1;
+
+        int iBytes = 0;
+        while ( iCode & 0x80 )
+        {
+                iBytes++;
+                iCode <<= 1;
+        }
+
+        assert ( iBytes>=2 && iBytes<=4 );
+        return iBytes;
+}
+
 //////////////////////////////////////////////////////////////////////////
 
 CSphMultiformTokenizer::CSphMultiformTokenizer ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer )
diff --git a/src/sphinx.h b/src/sphinx.h
index 1df08bcc8..7ae826155 100644
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -66,6 +66,10 @@
 #include <mysql.h>
 #endif
 
+#if USE_SCWS
+#include <scws/scws.h>
+#endif
+
 #if USE_WINDOWS
 typedef __int64				SphOffset_t;
 #define STDOUT_FILENO		fileno(stdout)
@@ -498,6 +502,14 @@ struct CSphTokenizerSettings
 	CSphString			m_sNgramChars;
 	CSphString			m_sBlendChars;
 	CSphString			m_sBlendMode;
+        
+
+        
+        CSphString                      m_scwsDict;
+        CSphString                      m_scwsRule;
+        int                      m_scwsMulti;
+        
+
 	CSphString			m_sIndexingPlugin;	///< this tokenizer wants an external plugin to process its raw output
 
 						CSphTokenizerSettings ();
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp
index d5db1f6a0..c6dc8cbc1 100644
--- a/src/sphinxutils.cpp
+++ b/src/sphinxutils.cpp
@@ -592,6 +592,12 @@ static KeyDesc_t g_dKeysIndex[] =
 	{ "rlp_context",			0, NULL },
 	{ "ondisk_attrs",			0, NULL },
 	{ "index_token_filter",		0, NULL },
+
+	{ "scws",		0, NULL },
+	{ "scws_dict",		0, NULL },
+	{ "scws_rule",		0, NULL },
+	{ "scws_multi",		0, NULL },
+
 	{ NULL,						0, NULL }
 };
 
@@ -1264,6 +1270,14 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
 			sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" );
 	}
 
+        if ( hIndex ( "scws" ) )
+        {
+                tSettings.m_iType = TOKENIZER_SCWS;
+                tSettings.m_scwsDict = hIndex.GetStr ( "scws_dict" );
+                tSettings.m_scwsRule = hIndex.GetStr ( "scws_rule" );
+                tSettings.m_scwsMulti = hIndex.GetInt ( "scws_multi",0 );
+        }
+
 	tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" );
 	tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 );
 	tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" );
diff --git a/src/sphinxutils.h b/src/sphinxutils.h
index f6757b362..b5b82e65c 100644
--- a/src/sphinxutils.h
+++ b/src/sphinxutils.h
@@ -147,6 +147,7 @@ enum
 	// where was TOKENIZER_SBCS=1 once
 	TOKENIZER_UTF8		= 2,
 	TOKENIZER_NGRAM	= 3
+	,TOKENIZER_SCWS	= 4
 };
 
 /// load config file

From bb75ac91e55f8c83c52daf710b20d1baa849e78d Mon Sep 17 00:00:00 2001
From: hetao <hetao@hetao.name>
Date: Wed, 19 Apr 2017 20:24:06 +0800
Subject: [PATCH 02/14] fix compile error when built without SCWS support

---
 configure           | 48 +++++++++++++++++++++++++++++++++++++++++++++
 configure.ac        | 22 +++++++++++++++++++++
 src/Makefile.am     | 14 ++++++++++---
 src/Makefile.in     | 11 ++++++++---
 src/sphinx.cpp      | 33 +++++++++++++++++++++++--------
 src/sphinx.h        |  4 ++--
 src/sphinxutils.cpp |  8 ++++----
 src/sphinxutils.h   |  2 ++
 8 files changed, 122 insertions(+), 20 deletions(-)

diff --git a/configure b/configure
index 10bf6a600..56e2107af 100755
--- a/configure
+++ b/configure
@@ -608,6 +608,8 @@ LTLIBOBJS
 CONFDIR
 USE_RLP_FALSE
 USE_RLP_TRUE
+USE_SCWS_FALSE
+USE_SCWS_TRUE
 USE_RE2_FALSE
 USE_RE2_TRUE
 LIBRE2_PATH
@@ -653,6 +655,7 @@ DEPDIR
 OBJEXT
 EXEEXT
 ac_ct_CC
+ac_cv_use_scws
 CPPFLAGS
 LDFLAGS
 CFLAGS
@@ -745,6 +748,7 @@ with_re2
 with_re2_includes
 with_re2_libs
 with_rlp
+with_scws
 with_iconv
 with_unixodbc
 enable_mem_override
@@ -1417,6 +1421,8 @@ Optional Packages:
   --with-re2-libs         path to RE2 libraries
   --with-rlp              compile with RLP library support (default is
                           disabled)
+  --with-scws             compile with scws library support (default is
+                          disabled)
   --with-iconv            compile with iconv support (default is autodetect)
   --with-unixodbc         compile with UnixODBC support (default is
                           autodetect)
@@ -8222,6 +8228,44 @@ fi
 
 
 
+# Check whether --with-scws was given.
+if test "${with_scws+set}" = set; then :
+  withval=$with_scws; ac_cv_use_scws=$withval
+else
+  ac_cv_use_scws=no
+
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to compile with scws library support" >&5
+$as_echo_n "checking whether to compile with scws library support... " >&6; }
+if test x$ac_cv_use_scws != xno; then
+	if test -d $withval && test -f $withval/include/scws/scws.h; then
+		{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "#define USE_SCWS 1" >>confdefs.h
+
+	else
+		as_fn_error $? "missing SCWS sources from libscws" "$LINENO" 5
+	fi
+else
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+$as_echo "#define USE_SCWS 0" >>confdefs.h
+
+fi
+ if test x$ac_cv_use_scws != xno; then
+  USE_SCWS_TRUE=
+  USE_SCWS_FALSE='#'
+else
+  USE_SCWS_TRUE='#'
+  USE_SCWS_FALSE=
+fi
+
+
+
 
 got_expat=0
 dl_expat=0
@@ -9102,6 +9146,10 @@ if test -z "${USE_RLP_TRUE}" && test -z "${USE_RLP_FALSE}"; then
   as_fn_error $? "conditional \"USE_RLP\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${USE_SCWS_TRUE}" && test -z "${USE_SCWS_FALSE}"; then
+  as_fn_error $? "conditional \"USE_SCWS\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 
 : "${CONFIG_STATUS=./config.status}"
 ac_write_fail=0
diff --git a/configure.ac b/configure.ac
index 643f5cad3..e65a01f70 100644
--- a/configure.ac
+++ b/configure.ac
@@ -551,6 +551,28 @@ fi
 AM_CONDITIONAL(USE_RLP, test x$ac_cv_use_rlp != xno)
 
 
+dnl ---
+
+AC_ARG_WITH([scws],
+	AC_HELP_STRING([--with-scws], [compile with scws library support (default is disabled)]),
+	[ac_cv_use_scws=$withval], [ac_cv_use_scws=no]
+)
+
+AC_MSG_CHECKING([whether to compile with scws library support])
+if test x$ac_cv_use_scws != xno; then
+	if test -d $ac_cv_use_scws && test -f $ac_cv_use_scws/include/scws/scws.h; then
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(USE_SCWS, 1, [scws library support])
+	else
+		AC_MSG_ERROR([missing scws sources from libscws])
+	fi
+else
+	AC_MSG_RESULT([no])
+	AC_DEFINE(USE_SCWS, 0, [scws library support])
+fi
+AM_CONDITIONAL(USE_SCWS, test x$ac_cv_use_scws != xno)
+
+
 dnl ---
 
 got_expat=0
diff --git a/src/Makefile.am b/src/Makefile.am
index 4312c9ed7..351ca0982 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -3,7 +3,7 @@ SRC_SPHINX = sphinx.cpp sphinxexcerpt.cpp sphinxquery.cpp \
 	sphinxutils.cpp sphinxstd.cpp sphinxsort.cpp sphinxexpr.cpp sphinxfilter.cpp \
 	sphinxsearch.cpp sphinxrt.cpp sphinxjson.cpp sphinxudf.c sphinxaot.cpp sphinxplugin.cpp
 
-ARFLAGS = crU
+ARFLAGS = cr
 noinst_LIBRARIES = libsphinx.a
 libsphinx_a_SOURCES = $(SRC_SPHINX)
 
@@ -30,6 +30,14 @@ RLP_LIBS =
 RLP_INC =
 endif
 
-AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
-COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS)
+if USE_SCWS
+SCWS_LIBS = -L@ac_cv_use_scws@/lib/ -lscws
+SCWS_INC = -I@ac_cv_use_scws@/include
+else
+SCWS_LIBS =
+SCWS_INC =
+endif
+
+AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) $(SCWS_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(SCWS_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(SCWS_LIBS)
 LDADD = $(COMMON_LIBS)
diff --git a/src/Makefile.in b/src/Makefile.in
index 561f08696..5eda13abd 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -180,6 +180,7 @@ abs_top_builddir = @abs_top_builddir@
 abs_top_srcdir = @abs_top_srcdir@
 ac_ct_CC = @ac_ct_CC@
 ac_ct_CXX = @ac_ct_CXX@
+#ac_cv_use_scws = @ac_cv_use_scws@
 am__include = @am__include@
 am__leading_dot = @am__leading_dot@
 am__quote = @am__quote@
@@ -223,7 +224,7 @@ SRC_SPHINX = sphinx.cpp sphinxexcerpt.cpp sphinxquery.cpp \
 	sphinxutils.cpp sphinxstd.cpp sphinxsort.cpp sphinxexpr.cpp sphinxfilter.cpp \
 	sphinxsearch.cpp sphinxrt.cpp sphinxjson.cpp sphinxudf.c sphinxaot.cpp sphinxplugin.cpp
 
-ARFLAGS = crU
+ARFLAGS = cr
 noinst_LIBRARIES = libsphinx.a
 libsphinx_a_SOURCES = $(SRC_SPHINX)
 indexer_SOURCES = indexer.cpp
@@ -237,8 +238,12 @@ BUILT_SOURCES = extract-version
 @USE_RLP_TRUE@RLP_LIBS = -L$(top_srcdir)/rlp/lib/amd64-glibc25-gcc42 -lbtrlpc -lbtrlpcore -lbtutils
 @USE_RLP_FALSE@RLP_INC = 
 @USE_RLP_TRUE@RLP_INC = -I$(top_srcdir)/rlp/rlp/include -I$(top_srcdir)/rlp/utilities/include -D_REENTRANT
-AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
-COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS)
+@USE_SCWS_FALSE@SCWS_LIBS = 
+@USE_SCWS_TRUE@SCWS_LIBS = -L@ac_cv_use_scws@/lib/ -lscws
+@USE_SCWS_FALSE@SCWS_INC = 
+@USE_SCWS_TRUE@SCWS_INC = -I@ac_cv_use_scws@/include
+AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) $(SCWS_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""
+COMMON_LIBS = libsphinx.a $(LIBSTEMMER_LIBS) $(MYSQL_LIBS) $(SCWS_LIBS) $(PGSQL_LIBS) $(LIBRE2_LIBS) $(RLP_LIBS) $(SCWS_LIBS)
 LDADD = $(COMMON_LIBS)
 all: $(BUILT_SOURCES)
 	$(MAKE) $(AM_MAKEFLAGS) all-am
diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index c9d6afca6..9058ca609 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -2576,6 +2576,7 @@ class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8<IS_QUERY>
 
 
 /// SCWS tokenizer
+#if USE_SCWS
 template < bool IS_QUERY >
 class CSphTokenizer_SCWS : public CSphTokenizerBase2
 {
@@ -2588,11 +2589,12 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2
         virtual int                 GetCodepointLength ( int iCode ) const;
         virtual int                 GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
 
-      
+  
     scws_t s; 
     scws_res_t cur;
-};
 
+};
+#endif
 
 struct CSphNormalForm
 {
@@ -3811,11 +3813,13 @@ ISphTokenizer * sphCreateUTF8NgramTokenizer ()
 {
 	return new CSphTokenizer_UTF8Ngram<false> ();
 }
+
+#if USE_SCWS
 ISphTokenizer * sphCreateUTF8SCWSTokenizer ()
 {
         return new CSphTokenizer_SCWS<false> ();
 }
-
+#endif
 
 /////////////////////////////////////////////////////////////////////////////
 
@@ -4414,8 +4418,13 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
 
 	tSettings.m_iType = tReader.GetByte ();
 
-        if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM && tSettings.m_iType!=TOKENIZER_SCWS)
-	{
+        if ( 
+                tSettings.m_iType!=TOKENIZER_UTF8 
+                && tSettings.m_iType!=TOKENIZER_NGRAM  
+#if USE_SCWS
+                && tSettings.m_iType!=TOKENIZER_SCWS
+#endif
+        ){
 		sWarning = "can't load an old index with SBCS tokenizer";
 		return false;
 	}
@@ -4746,9 +4755,9 @@ ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings,
 	switch ( tSettings.m_iType )
 	{
 		case TOKENIZER_UTF8:	pTokenizer = sphCreateUTF8Tokenizer (); break;
-
+#if USE_SCWS
 		case TOKENIZER_SCWS:	pTokenizer = sphCreateUTF8SCWSTokenizer (); break;
-                
+#endif               
 		case TOKENIZER_NGRAM:	pTokenizer = sphCreateUTF8NgramTokenizer (); break;
 		default:
 			sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
@@ -6445,16 +6454,24 @@ BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken ()
 
 /////////////////////////////////////////////////////////////////////////////
 
+#if USE_SCWS
+
+
+
 template < bool IS_QUERY >
 CSphTokenizer_SCWS<IS_QUERY>::CSphTokenizer_SCWS ()
 {
+
         s = scws_new();
+
 }
 template < bool IS_QUERY >
 CSphTokenizer_SCWS<IS_QUERY>::~CSphTokenizer_SCWS ()
 {
+
         scws_free_result(cur);
         scws_free(s);
+
 }
 
 
@@ -6536,7 +6553,7 @@ int CSphTokenizer_SCWS<IS_QUERY>::GetCodepointLength ( int iCode ) const
         assert ( iBytes>=2 && iBytes<=4 );
         return iBytes;
 }
-
+#endif
 //////////////////////////////////////////////////////////////////////////
 
 CSphMultiformTokenizer::CSphMultiformTokenizer ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer )
diff --git a/src/sphinx.h b/src/sphinx.h
index 7ae826155..efafcfc0a 100644
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -504,11 +504,11 @@ struct CSphTokenizerSettings
 	CSphString			m_sBlendMode;
         
 
-        
+ #if USE_SCWS       
         CSphString                      m_scwsDict;
         CSphString                      m_scwsRule;
         int                      m_scwsMulti;
-        
+#endif      
 
 	CSphString			m_sIndexingPlugin;	///< this tokenizer wants an external plugin to process its raw output
 
diff --git a/src/sphinxutils.cpp b/src/sphinxutils.cpp
index c6dc8cbc1..caf80da2e 100644
--- a/src/sphinxutils.cpp
+++ b/src/sphinxutils.cpp
@@ -592,12 +592,12 @@ static KeyDesc_t g_dKeysIndex[] =
 	{ "rlp_context",			0, NULL },
 	{ "ondisk_attrs",			0, NULL },
 	{ "index_token_filter",		0, NULL },
-
+#if USE_SCWS
 	{ "scws",		0, NULL },
 	{ "scws_dict",		0, NULL },
 	{ "scws_rule",		0, NULL },
 	{ "scws_multi",		0, NULL },
-
+#endif
 	{ NULL,						0, NULL }
 };
 
@@ -1269,7 +1269,7 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
 		else
 			sphWarning ( "ngram_chars specified, but ngram_len=0; IGNORED" );
 	}
-
+#if USE_SCWS
         if ( hIndex ( "scws" ) )
         {
                 tSettings.m_iType = TOKENIZER_SCWS;
@@ -1277,7 +1277,7 @@ void sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings
                 tSettings.m_scwsRule = hIndex.GetStr ( "scws_rule" );
                 tSettings.m_scwsMulti = hIndex.GetInt ( "scws_multi",0 );
         }
-
+#endif
 	tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" );
 	tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len", 1 ), 1 );
 	tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" );
diff --git a/src/sphinxutils.h b/src/sphinxutils.h
index b5b82e65c..e92114ef3 100644
--- a/src/sphinxutils.h
+++ b/src/sphinxutils.h
@@ -147,7 +147,9 @@ enum
 	// where was TOKENIZER_SBCS=1 once
 	TOKENIZER_UTF8		= 2,
 	TOKENIZER_NGRAM	= 3
+#if USE_SCWS
 	,TOKENIZER_SCWS	= 4
+#endif
 };
 
 /// load config file

From 3d99dcbbb496e53ce595f0a3ac1a8a27925b80b1 Mon Sep 17 00:00:00 2001
From: hetao <hetao@hetao.name>
Date: Wed, 19 Apr 2017 20:52:22 +0800
Subject: [PATCH 03/14] add missing USE_SCWS define to config.h.in

---
 config/config.h.in | 3 +++
 src/Makefile.in    | 1 -
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/config/config.h.in b/config/config.h.in
index 7eac198a4..6d53f6a65 100644
--- a/config/config.h.in
+++ b/config/config.h.in
@@ -288,6 +288,9 @@
 /* RLP library support */
 #undef USE_RLP
 
+/* SCWS library support */
+#undef USE_SCWS
+
 /* define to use POSIX Syslog for logging */
 #undef USE_SYSLOG
 
diff --git a/src/Makefile.in b/src/Makefile.in
index 5eda13abd..83b69d8f6 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -180,7 +180,6 @@ abs_top_builddir = @abs_top_builddir@
 abs_top_srcdir = @abs_top_srcdir@
 ac_ct_CC = @ac_ct_CC@
 ac_ct_CXX = @ac_ct_CXX@
-#ac_cv_use_scws = @ac_cv_use_scws@
 am__include = @am__include@
 am__leading_dot = @am__leading_dot@
 am__quote = @am__quote@

From 02aa41cf01a2b68729efb781cbe0e36d7c81fe2b Mon Sep 17 00:00:00 2001
From: hetao <hetao@talkweb.com.cn>
Date: Wed, 19 Apr 2017 21:22:39 +0800
Subject: [PATCH 04/14] fix SetCaseFolding

---
 src/sphinx.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 9058ca609..11d950122 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -6462,7 +6462,10 @@ template < bool IS_QUERY >
 CSphTokenizer_SCWS<IS_QUERY>::CSphTokenizer_SCWS ()
 {
 
-        s = scws_new();
+	CSphString sTmp;
+	SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
+	m_bHasBlend = false;
+	s = scws_new();
 
 }
 template < bool IS_QUERY >

From c5d4b7e7c1c39c869d50d921a4d2a5203112adcb Mon Sep 17 00:00:00 2001
From: hetao <hetao@hetao.name>
Date: Thu, 20 Apr 2017 19:35:15 +0800
Subject: [PATCH 05/14] fix scws_set_ignore default true

---
 src/sphinx.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 9058ca609..cf71ccbeb 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -6490,6 +6490,7 @@ void CSphTokenizer_SCWS<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
             scws_set_rule(s, m_tSettings.m_scwsDict.cstr ());
 	}
         scws_set_charset(s, "utf8");
+		scws_set_ignore(s, true);
         
         
         if ( m_tSettings.m_scwsMulti)

From d5b39bd7921f618c2908d36ed9d17cd0fb3abfd5 Mon Sep 17 00:00:00 2001
From: hetao <hetao@talkweb.com.cn>
Date: Fri, 21 Apr 2017 09:55:59 +0800
Subject: [PATCH 06/14] static link

---
 src/Makefile.am | 2 +-
 src/Makefile.in | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Makefile.am b/src/Makefile.am
index 351ca0982..0874cf503 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -31,7 +31,7 @@ RLP_INC =
 endif
 
 if USE_SCWS
-SCWS_LIBS = -L@ac_cv_use_scws@/lib/ -lscws
+SCWS_LIBS = @ac_cv_use_scws@/lib/libscws.a
 SCWS_INC = -I@ac_cv_use_scws@/include
 else
 SCWS_LIBS =
diff --git a/src/Makefile.in b/src/Makefile.in
index 83b69d8f6..491381fc7 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -238,7 +238,7 @@ BUILT_SOURCES = extract-version
 @USE_RLP_FALSE@RLP_INC = 
 @USE_RLP_TRUE@RLP_INC = -I$(top_srcdir)/rlp/rlp/include -I$(top_srcdir)/rlp/utilities/include -D_REENTRANT
 @USE_SCWS_FALSE@SCWS_LIBS = 
-@USE_SCWS_TRUE@SCWS_LIBS = -L@ac_cv_use_scws@/lib/ -lscws
+@USE_SCWS_TRUE@SCWS_LIBS = @ac_cv_use_scws@/lib/libscws.a
 @USE_SCWS_FALSE@SCWS_INC = 
 @USE_SCWS_TRUE@SCWS_INC = -I@ac_cv_use_scws@/include
 AM_CPPFLAGS = $(LIBRE2_CFLAGS) $(RLP_INC) $(SCWS_INC) -DSYSCONFDIR="\"$(sysconfdir)\"" -DDATADIR="\"$(localstatedir)/data\""

From 7308c936dda88c63e5091814dca28b6b7548ccab Mon Sep 17 00:00:00 2001
From: hetao <hetao@talkweb.com.cn>
Date: Fri, 21 Apr 2017 10:56:06 +0800
Subject: [PATCH 07/14] fix set in setting

---
 src/sphinx.cpp | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 97858c006..23ce7e18b 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -2586,12 +2586,32 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2
         virtual void                SetBuffer ( const BYTE * sBuffer, int iLength );
         virtual BYTE *              GetToken ();
         virtual ISphTokenizer *     Clone ( ESphTokenizerClone eMode ) const;
+		virtual void				Setup ( const CSphTokenizerSettings & tSettings )				{ 
+			CSphTokenizerBase2::Setup ( tSettings ); 
+			if ( !m_tSettings.m_scwsDict.IsEmpty ()  )
+			{ 
+				scws_set_dict(s, m_tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM);
+			}
+			if ( !m_tSettings.m_scwsRule.IsEmpty ())
+			{ 
+				scws_set_rule(s, m_tSettings.m_scwsDict.cstr ());
+			}
+			scws_set_charset(s, "utf8");
+			scws_set_ignore(s, true);
+
+
+			if ( m_tSettings.m_scwsMulti)
+			{ 
+				scws_set_multi(s, m_tSettings.m_scwsMulti << 12);
+			}else{
+				scws_set_multi(s, 0);
+			}
+		}
         virtual int                 GetCodepointLength ( int iCode ) const;
         virtual int                 GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
 
-  
-    scws_t s; 
-    scws_res_t cur;
+		scws_t s; 
+		scws_res_t cur;
 
 };
 #endif
@@ -6484,24 +6504,6 @@ void CSphTokenizer_SCWS<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
         // check that old one is over and that new length is sane
         assert ( iLength>=0 );
         
-        if ( !m_tSettings.m_scwsDict.IsEmpty ()  )
-	{ 
-            scws_set_dict(s, m_tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM);
-	}
-        if ( !m_tSettings.m_scwsRule.IsEmpty ())
-	{ 
-            scws_set_rule(s, m_tSettings.m_scwsDict.cstr ());
-	}
-        scws_set_charset(s, "utf8");
-		scws_set_ignore(s, true);
-        
-        
-        if ( m_tSettings.m_scwsMulti)
-	{ 
-            scws_set_multi(s, m_tSettings.m_scwsMulti << 12);
-	}else{
-            scws_set_multi(s, 0);
-        }
         m_pBuffer = sBuffer;
         scws_send_text(s, (char*)m_pBuffer, iLength);
 }

From e2484ed4cb85fd0e76ced631f50888424986ff4f Mon Sep 17 00:00:00 2001
From: hetao <hetao@talkweb.com.cn>
Date: Mon, 24 Apr 2017 12:26:58 +0800
Subject: [PATCH 08/14] fix query bug of scws gettoken

---
 src/sphinx.cpp | 318 +++++++++++++++++++++++++++++++++++++++++++++----
 src/sphinx.h   |   1 +
 2 files changed, 293 insertions(+), 26 deletions(-)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 23ce7e18b..dd3bd09b0 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -2588,30 +2588,31 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2
         virtual ISphTokenizer *     Clone ( ESphTokenizerClone eMode ) const;
 		virtual void				Setup ( const CSphTokenizerSettings & tSettings )				{ 
 			CSphTokenizerBase2::Setup ( tSettings ); 
-			if ( !m_tSettings.m_scwsDict.IsEmpty ()  )
+			if ( !tSettings.m_scwsDict.IsEmpty ()  )
 			{ 
-				scws_set_dict(s, m_tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM);
+				scws_set_dict(s, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM);
 			}
-			if ( !m_tSettings.m_scwsRule.IsEmpty ())
+			if ( !tSettings.m_scwsRule.IsEmpty ())
 			{ 
-				scws_set_rule(s, m_tSettings.m_scwsDict.cstr ());
+				scws_set_rule(s, tSettings.m_scwsRule.cstr ()); // rule file comes from scws_rule, not scws_dict
 			}
 			scws_set_charset(s, "utf8");
 			scws_set_ignore(s, true);
 
 
-			if ( m_tSettings.m_scwsMulti)
+			if ( tSettings.m_scwsMulti)
 			{ 
-				scws_set_multi(s, m_tSettings.m_scwsMulti << 12);
+				scws_set_multi(s, tSettings.m_scwsMulti << 12);
 			}else{
 				scws_set_multi(s, 0);
 			}
 		}
         virtual int                 GetCodepointLength ( int iCode ) const;
         virtual int                 GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
+	const BYTE * m_pText;
 
 		scws_t s; 
-		scws_res_t cur;
+		scws_res_t res,cur;
 
 };
 #endif
@@ -4475,6 +4476,11 @@ bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSett
 		tSettings.m_sBlendChars = tReader.GetString ();
 	if ( uVersion>=24 )
 		tSettings.m_sBlendMode = tReader.GetString();
+#if USE_SCWS
+	tSettings.m_scwsMulti= tReader.GetDword();
+	tSettings.m_scwsDict= tReader.GetString();
+	tSettings.m_scwsRule= tReader.GetString();
+#endif
 
 	return true;
 }
@@ -4504,6 +4510,11 @@ void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, i
 	tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
 	tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
 	tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
+#if USE_SCWS
+	tWriter.PutDword( tSettings.m_scwsMulti);
+	tWriter.PutString ( tSettings.m_scwsDict.cstr()) ;
+	tWriter.PutString ( tSettings.m_scwsRule.cstr());
+#endif
 }
 
 
@@ -6491,10 +6502,7 @@ CSphTokenizer_SCWS<IS_QUERY>::CSphTokenizer_SCWS ()
 template < bool IS_QUERY >
 CSphTokenizer_SCWS<IS_QUERY>::~CSphTokenizer_SCWS ()
 {
-
-        scws_free_result(cur);
         scws_free(s);
-
 }
 
 
@@ -6503,28 +6511,285 @@ void CSphTokenizer_SCWS<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
 {
         // check that old one is over and that new length is sane
         assert ( iLength>=0 );
+
+	// set buffer
+	m_pTokenStart = m_pTokenEnd = NULL;
+	m_pBlendStart = m_pBlendEnd = NULL;
+
+	m_pText = m_pBuffer = sBuffer;
+	m_pBufferMax = sBuffer + iLength;
+	m_pCur = sBuffer;
+
+	m_iOvershortCount = 0;
+	m_bBoundary = m_bTokenBoundary = false;
         
-        m_pBuffer = sBuffer;
-        scws_send_text(s, (char*)m_pBuffer, iLength);
+	res = cur = NULL;
+        scws_send_text(s, (char*)m_pText, iLength);
 }
 
 
 template < bool IS_QUERY >
 BYTE * CSphTokenizer_SCWS<IS_QUERY>::GetToken ()
 {
-        if (cur == NULL)
-        {
-            cur = scws_get_result(s);
-            if(cur == NULL){
-                return NULL;
-            }
-        }
-        memcpy(m_sAccum, m_pBuffer+cur->off, cur->len);
-        m_sAccum[cur->len]='\0';
-        m_pCur += cur->off;
-        cur = cur->next;
-        return m_sAccum;
-        
+	m_bWasSpecial = false;
+	m_bBlended = false;
+	m_iOvershortCount = 0;
+	m_bTokenBoundary = false;
+	m_bWasSynonym = false;
+	if( m_bHasBlend)
+	{
+		BYTE * pVar = GetBlendedVariant ();
+		if ( pVar )
+			return pVar;
+		m_bBlendedPart = ( m_pBlendEnd!=NULL );
+	}
+
+	bool bGotNonToken = ( !IS_QUERY || m_bPhrase ); // only do this in query mode, never in indexing mode, never within phrases
+	bool bGotSoft = false; // hey Beavis he said soft huh huhhuh
+
+	m_pTokenStart = NULL;
+	for ( ;; )
+	{
+		// get next codepoint
+		const BYTE * const pCur = m_pCur; // to redo special char, if there's a token already
+
+		if(cur !=NULL){
+			memcpy(m_sAccum, m_pText + cur->off, cur->len);
+			m_sAccum[cur->len]='\0';
+			cur = cur->next;
+			return m_sAccum;
+		}
+		m_pText = m_pCur;
+
+
+		int iCodePoint;
+		int iCode;
+		if ( pCur<m_pBufferMax && *pCur<128 )
+		{
+			iCodePoint = *m_pCur++;
+			iCode = m_tLC.m_pChunk[0][iCodePoint];
+		} else
+		{
+			iCodePoint = GetCodepoint(); // advances m_pCur
+			iCode = m_tLC.ToLower ( iCodePoint );
+		}
+
+		// handle escaping
+		bool bWasEscaped = ( IS_QUERY && iCodePoint=='\\' ); // whether current codepoint was escaped
+		if ( bWasEscaped )
+		{
+			iCodePoint = GetCodepoint();
+			iCode = m_tLC.ToLower ( iCodePoint );
+			if ( !Special2Simple ( iCode ) )
+				iCode = 0;
+		}
+		// handle eof
+		if ( iCode<0 )
+		{
+			FlushAccum ();
+
+			// suddenly, exceptions
+			if ( m_pExc && m_pTokenStart && CheckException ( m_pTokenStart, pCur, IS_QUERY ) )
+				return m_sAccum;
+
+			// skip trailing short word
+			if ( m_iLastTokenLen<m_tSettings.m_iMinWordLen )
+			{
+				if ( !m_bShortTokenFilter || !ShortTokenFilter ( m_sAccum, m_iLastTokenLen ) )
+				{
+					if ( m_iLastTokenLen )
+						m_iOvershortCount++;
+					m_iLastTokenLen = 0;
+
+					if( m_bHasBlend)
+						BlendAdjust ( pCur );
+					return NULL;
+				}
+			}
+
+			// keep token end here as BlendAdjust might change m_pCur
+			m_pTokenEnd = m_pCur;
+			if( m_bHasBlend&& !BlendAdjust ( pCur ) )
+				return NULL;
+			if( m_bHasBlend&& m_bBlended )
+				return GetBlendedVariant();
+
+			// return trailing word
+			return m_sAccum;
+		}
+
+		// handle all the flags..
+		if_const ( IS_QUERY )
+			iCode = CodepointArbitrationQ ( iCode, bWasEscaped, *m_pCur );
+		else if ( m_bDetectSentences )
+			iCode = CodepointArbitrationI ( iCode );
+
+		// handle ignored chars
+		if ( iCode & FLAG_CODEPOINT_IGNORE ){
+			continue;
+		}
+
+		// handle blended characters
+		if( m_bHasBlend&& ( iCode & FLAG_CODEPOINT_BLEND ) )
+		{
+			if ( m_pBlendEnd )
+				iCode = 0;
+			else
+			{
+				m_bBlended = true;
+				m_pBlendStart = m_iAccum ? m_pTokenStart : pCur;
+			}
+		}
+
+		// handle soft-whitespace-only tokens
+		if ( !bGotNonToken && !m_iAccum )
+		{
+			if ( !bGotSoft )
+			{
+				// detect opening soft whitespace
+				if ( ( iCode==0 && !IsWhitespace ( iCodePoint ) && !IsPunctuation ( iCodePoint ) )
+						|| ( ( iCode & FLAG_CODEPOINT_BLEND ) && !m_iAccum ) )
+				{
+					bGotSoft = true;
+				}
+			} else
+			{
+				// detect closing hard whitespace or special
+				// (if there was anything meaningful in the meantime, we must never get past the outer if!)
+				if ( IsWhitespace ( iCodePoint ) || ( iCode & FLAG_CODEPOINT_SPECIAL ) )
+				{
+					m_iOvershortCount++;
+					bGotNonToken = true;
+				}
+			}
+		}
+
+		// handle whitespace and boundary
+		if ( m_bBoundary && ( iCode==0 ) )
+		{
+			m_bTokenBoundary = true;
+			m_iBoundaryOffset = pCur - m_pBuffer - 1;
+		}
+		m_bBoundary = ( iCode & FLAG_CODEPOINT_BOUNDARY )!=0;
+
+		// handle separator (aka, most likely a token!)
+		if ( iCode==0 || m_bBoundary )
+		{
+			FlushAccum ();
+
+			// suddenly, exceptions
+			if ( m_pExc && CheckException ( m_pTokenStart ? m_pTokenStart : pCur, pCur, IS_QUERY ) ){
+				return m_sAccum;
+			}
+
+			if( m_bHasBlend&& !BlendAdjust ( pCur ) ){
+				continue;
+			}
+
+
+			if ( m_iLastTokenLen<m_tSettings.m_iMinWordLen
+					&& !( m_bShortTokenFilter && ShortTokenFilter ( m_sAccum, m_iLastTokenLen ) ) )
+			{
+				if ( m_iLastTokenLen )
+					m_iOvershortCount++;
+				continue;
+			} else
+			{
+				m_pTokenEnd = pCur;
+				if( m_bHasBlend&& m_bBlended ){
+					return GetBlendedVariant();
+				}
+				return m_sAccum;
+			}
+		}
+
+		// handle specials
+		if ( iCode & FLAG_CODEPOINT_SPECIAL )
+		{
+			// skip short words preceding specials
+			if ( m_iAccum<m_tSettings.m_iMinWordLen )
+			{
+				m_sAccum[m_iAccum] = '\0';
+
+				if ( !m_bShortTokenFilter || !ShortTokenFilter ( m_sAccum, m_iAccum ) )
+				{
+					if ( m_iAccum )
+						m_iOvershortCount++;
+
+					FlushAccum ();
+				}
+			}
+
+			if ( m_iAccum==0 )
+			{
+				m_bNonBlended = m_bNonBlended || ( !( iCode & FLAG_CODEPOINT_BLEND ) && !( iCode & FLAG_CODEPOINT_SPECIAL ) );
+				m_bWasSpecial = !( iCode & FLAG_CODEPOINT_NGRAM );
+				m_pTokenStart = pCur;
+				m_pTokenEnd = m_pCur;
+				AccumCodepoint ( iCode & MASK_CODEPOINT ); // handle special as a standalone token
+			} else
+			{
+				m_pCur = pCur; // we need to flush current accum and then redo special char again
+				m_pTokenEnd = pCur;
+			}
+			FlushAccum ();
+
+			// suddenly, exceptions
+			if ( m_pExc && CheckException ( m_pTokenStart, pCur, IS_QUERY ) ){
+				return m_sAccum;
+			}
+			if( m_bHasBlend)
+			{
+				if ( !BlendAdjust ( pCur ) )
+					continue;
+				if ( m_bBlended )
+					return GetBlendedVariant();
+			}
+
+			return m_sAccum;
+		}
+
+		if ( m_iAccum==0 )
+			m_pTokenStart = pCur;
+
+		// tricky bit
+		// heading modifiers must not (!) affected blended status
+		// eg. we want stuff like '=-' (w/o apostrophes) thrown away when pure_blend is on
+
+		if( m_bHasBlend)
+			if_const (!( IS_QUERY && !m_iAccum && sphIsModifier ( iCode & MASK_CODEPOINT ) ) )
+				m_bNonBlended = m_bNonBlended || !( iCode & FLAG_CODEPOINT_BLEND );
+		// just accumulate
+		// manual inlining of utf8 encoder gives us a few extra percent
+		// which is important here, this is a hotspot
+		if ( m_iAccum<SPH_MAX_WORD_LEN && ( m_pAccum-m_sAccum+SPH_MAX_UTF8_BYTES<=(int)sizeof(m_sAccum) ) )
+		{
+			iCode &= MASK_CODEPOINT;
+			m_iAccum++;
+
+			scws_send_text(s, (char*)m_pText, strlen((char*)m_pText));
+			res = (cur = scws_get_result(s));//只读取一个单词
+			if(cur == NULL){
+				FlushAccum();
+				return NULL;
+			}
+
+			memcpy(m_sAccum, pCur+cur->off, cur->len);
+			m_sAccum[cur->len]='\0';
+
+			m_pTokenStart = m_pText + cur->off;
+			m_pCur = m_pText + cur->off + cur->len;
+			m_pTokenEnd = m_pCur;
+
+			cur = cur->next;
+			if(cur == NULL){
+				m_iLastTokenLen = 0;
+				m_iAccum = 0;
+				scws_free_result(res);
+			}
+			return m_sAccum;
+		}
+	}
 }
 
 template < bool IS_QUERY >
@@ -6533,11 +6798,12 @@ ISphTokenizer * CSphTokenizer_SCWS<IS_QUERY>::Clone ( ESphTokenizerClone eMode )
         if ( eMode!=SPH_CLONE_INDEX ) {
                 CSphTokenizer_SCWS<true> *pClone = new CSphTokenizer_SCWS<true>();
                 pClone->CloneBase ( this, eMode );
+		pClone->Setup(m_tSettings);
                 return pClone;
-
         } else {
                 CSphTokenizer_SCWS<false> *pClone = new CSphTokenizer_SCWS<false>();
                 pClone->CloneBase ( this, eMode );
+		pClone->Setup(m_tSettings);
                 return pClone;
         }
 }
diff --git a/src/sphinx.h b/src/sphinx.h
index efafcfc0a..384a6222e 100644
--- a/src/sphinx.h
+++ b/src/sphinx.h
@@ -455,6 +455,7 @@ class CSphLowercaser
 
 	int					m_iChunks;					///< how much chunks are actually allocated
 	int *				m_pData;					///< chunks themselves
+public:
 	int *				m_pChunk [ CHUNK_COUNT ];	///< pointers to non-empty chunks
 };
 

From f004fc7c1d6768c234ae2994a8d7cde71651e1f0 Mon Sep 17 00:00:00 2001
From: hetao <hetao@talkweb.com.cn>
Date: Mon, 24 Apr 2017 13:41:59 +0800
Subject: [PATCH 09/14] disable memory dict

---
 src/sphinx.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index dd3bd09b0..9daca7aab 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -2590,7 +2590,7 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2
 			CSphTokenizerBase2::Setup ( tSettings ); 
 			if ( !tSettings.m_scwsDict.IsEmpty ()  )
 			{ 
-				scws_set_dict(s, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM);
+				scws_set_dict(s, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB);
 			}
 			if ( !tSettings.m_scwsRule.IsEmpty ())
 			{ 

From 8d6e8325aacf2611758f226440c871c545684fbb Mon Sep 17 00:00:00 2001
From: hetao <hetao@talkweb.com.cn>
Date: Mon, 24 Apr 2017 14:05:41 +0800
Subject: [PATCH 10/14] optimize performance

---
 src/sphinx.cpp | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 9daca7aab..e065d57ae 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -2577,6 +2577,9 @@ class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8<IS_QUERY>
 
 /// SCWS tokenizer
 #if USE_SCWS
+scws_t scws_global; 
+int scws_config_set=false;
+
 template < bool IS_QUERY >
 class CSphTokenizer_SCWS : public CSphTokenizerBase2
 {
@@ -2586,34 +2589,36 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2
         virtual void                SetBuffer ( const BYTE * sBuffer, int iLength );
         virtual BYTE *              GetToken ();
         virtual ISphTokenizer *     Clone ( ESphTokenizerClone eMode ) const;
-		virtual void				Setup ( const CSphTokenizerSettings & tSettings )				{ 
-			CSphTokenizerBase2::Setup ( tSettings ); 
+	virtual void		Setup ( const CSphTokenizerSettings & tSettings ){
+		CSphTokenizerBase2::Setup ( tSettings ); 
+		if(scws_config_set==false){
+			scws_config_set=true;
 			if ( !tSettings.m_scwsDict.IsEmpty ()  )
 			{ 
-				scws_set_dict(s, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB);
+				scws_set_dict(scws_global, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM);
 			}
 			if ( !tSettings.m_scwsRule.IsEmpty ())
 			{ 
-				scws_set_rule(s, tSettings.m_scwsDict.cstr ());
+				scws_set_rule(scws_global, tSettings.m_scwsDict.cstr ());
 			}
-			scws_set_charset(s, "utf8");
-			scws_set_ignore(s, true);
+			scws_set_charset(scws_global, "utf8");
+			scws_set_ignore(scws_global, true);
 
 
 			if ( tSettings.m_scwsMulti)
 			{ 
-				scws_set_multi(s, tSettings.m_scwsMulti << 12);
+				scws_set_multi(scws_global, tSettings.m_scwsMulti << 12);
 			}else{
-				scws_set_multi(s, 0);
+				scws_set_multi(scws_global, 0);
 			}
 		}
+		scws_source = scws_fork(scws_global);
+	}
         virtual int                 GetCodepointLength ( int iCode ) const;
         virtual int                 GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
 	const BYTE * m_pText;
-
-		scws_t s; 
-		scws_res_t res,cur;
-
+	scws_res_t res,cur;
+	scws_t scws_source; 
 };
 #endif
 
@@ -6496,13 +6501,12 @@ CSphTokenizer_SCWS<IS_QUERY>::CSphTokenizer_SCWS ()
 	CSphString sTmp;
 	SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
 	m_bHasBlend = false;
-	s = scws_new();
-
+	if(scws_global==NULL) scws_global = scws_new();
 }
 template < bool IS_QUERY >
 CSphTokenizer_SCWS<IS_QUERY>::~CSphTokenizer_SCWS ()
 {
-        scws_free(s);
+	scws_free(scws_source);
 }
 
 
@@ -6524,7 +6528,7 @@ void CSphTokenizer_SCWS<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
 	m_bBoundary = m_bTokenBoundary = false;
         
 	res = cur = NULL;
-        scws_send_text(s, (char*)m_pText, iLength);
+        scws_send_text(scws_source, (char*)m_pText, iLength);
 }
 
 
@@ -6767,8 +6771,8 @@ BYTE * CSphTokenizer_SCWS<IS_QUERY>::GetToken ()
 			iCode &= MASK_CODEPOINT;
 			m_iAccum++;
 
-			scws_send_text(s, (char*)m_pText, strlen((char*)m_pText));
-			res = (cur = scws_get_result(s));//只读取一个单词
+			scws_send_text(scws_source, (char*)m_pText, strlen((char*)m_pText));
+			res = (cur = scws_get_result(scws_source));// read only one word
 			if(cur == NULL){
 				FlushAccum();
 				return NULL;

From 6a32ad0532288ba4d8c4ab22c9e2d7d0e62976c0 Mon Sep 17 00:00:00 2001
From: hetao <hetao@hetao.name>
Date: Mon, 24 Apr 2017 18:33:53 +0800
Subject: [PATCH 11/14] ignore case with scws

---
 src/sphinx.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index e065d57ae..06f43b1ff 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -6560,6 +6560,7 @@ BYTE * CSphTokenizer_SCWS<IS_QUERY>::GetToken ()
 		if(cur !=NULL){
 			memcpy(m_sAccum, m_pText + cur->off, cur->len);
 			m_sAccum[cur->len]='\0';
+			sphColumnToLowercase ( (char *)( m_sAccum ) );
 			cur = cur->next;
 			return m_sAccum;
 		}
@@ -6780,6 +6781,7 @@ BYTE * CSphTokenizer_SCWS<IS_QUERY>::GetToken ()
 
 			memcpy(m_sAccum, pCur+cur->off, cur->len);
 			m_sAccum[cur->len]='\0';
+			sphColumnToLowercase ( (char *)( m_sAccum ) );
 
 			m_pTokenStart = m_pText + cur->off;
 			m_pCur = m_pText + cur->off + cur->len;

From 5dc388c609269787569a54600b97789c013871d8 Mon Sep 17 00:00:00 2001
From: hetao <hetao@talkweb.com.cn>
Date: Thu, 27 Apr 2017 16:24:03 +0800
Subject: [PATCH 12/14] fix memory leak

---
 src/sphinx.cpp | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 06f43b1ff..ac410f061 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -6501,7 +6501,9 @@ CSphTokenizer_SCWS<IS_QUERY>::CSphTokenizer_SCWS ()
 	CSphString sTmp;
 	SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
 	m_bHasBlend = false;
-	if(scws_global==NULL) scws_global = scws_new();
+	if(scws_global==NULL) {
+		scws_global = scws_new();
+	}
 }
 template < bool IS_QUERY >
 CSphTokenizer_SCWS<IS_QUERY>::~CSphTokenizer_SCWS ()
@@ -6513,8 +6515,8 @@ CSphTokenizer_SCWS<IS_QUERY>::~CSphTokenizer_SCWS ()
 template < bool IS_QUERY >
 void CSphTokenizer_SCWS<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength )
 {
-        // check that old one is over and that new length is sane
-        assert ( iLength>=0 );
+	// check that old one is over and that new length is sane
+	assert ( iLength>=0 );
 
 	// set buffer
 	m_pTokenStart = m_pTokenEnd = NULL;
@@ -6526,9 +6528,9 @@ void CSphTokenizer_SCWS<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength
 
 	m_iOvershortCount = 0;
 	m_bBoundary = m_bTokenBoundary = false;
-        
+
 	res = cur = NULL;
-        scws_send_text(scws_source, (char*)m_pText, iLength);
+	scws_send_text(scws_source, (char*)m_pText, iLength);
 }
 
 
@@ -6558,11 +6560,18 @@ BYTE * CSphTokenizer_SCWS<IS_QUERY>::GetToken ()
 		const BYTE * const pCur = m_pCur; // to redo special char, if there's a token already
 
 		if(cur !=NULL){
-			memcpy(m_sAccum, m_pText + cur->off, cur->len);
-			m_sAccum[cur->len]='\0';
-			sphColumnToLowercase ( (char *)( m_sAccum ) );
 			cur = cur->next;
-			return m_sAccum;
+			if(cur != NULL){
+				// clamp the copy so an overlong SCWS word can never write past m_sAccum
+				const int iTokLen = ( (int)cur->len < (int)sizeof(m_sAccum) ) ? (int)cur->len : (int)sizeof(m_sAccum)-1;
+				memcpy(m_sAccum, m_pText + cur->off, iTokLen);
+				m_sAccum[iTokLen]='\0';
+				sphColumnToLowercase ( (char *)( m_sAccum ) );
+				return m_sAccum;
+			}
+			// result list exhausted: reset the accumulator state and release the list
+			m_iLastTokenLen = m_iAccum = 0;
+			scws_free_result(res);
 		}
 		m_pText = m_pCur;
 
@@ -6787,12 +6796,6 @@ BYTE * CSphTokenizer_SCWS<IS_QUERY>::GetToken ()
 			m_pCur = m_pText + cur->off + cur->len;
 			m_pTokenEnd = m_pCur;
 
-			cur = cur->next;
-			if(cur == NULL){
-				m_iLastTokenLen = 0;
-				m_iAccum = 0;
-				scws_free_result(res);
-			}
 			return m_sAccum;
 		}
 	}

From da1635f7e0b3ede18d45aa9f0cef913e1658b421 Mon Sep 17 00:00:00 2001
From: hetao <hetao@talkweb.com.cn>
Date: Thu, 27 Apr 2017 18:05:48 +0800
Subject: [PATCH 13/14] increase index speed

---
 src/sphinx.cpp | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index ac410f061..286e35330 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -6550,6 +6550,25 @@ BYTE * CSphTokenizer_SCWS<IS_QUERY>::GetToken ()
 		m_bBlendedPart = ( m_pBlendEnd!=NULL );
 	}
 
+	// indexing mode: return the words SCWS produced for the buffer sent in SetBuffer()
+	if(!IS_QUERY){
+		if(cur == NULL){
+			res = (cur = scws_get_result(scws_source));
+			if(cur == NULL)
+				return NULL;
+		}
+		// clamp the copy so an overlong SCWS word can never write past m_sAccum
+		const int iTokLen = ( (int)cur->len < (int)sizeof(m_sAccum) ) ? (int)cur->len : (int)sizeof(m_sAccum)-1;
+		memcpy(m_sAccum, m_pText + cur->off, iTokLen);
+		m_sAccum[iTokLen]='\0';
+		sphColumnToLowercase ( (char *)( m_sAccum ) );
+		m_iLastTokenLen = m_iAccum = 0;
+		cur = cur->next;
+		if(cur == NULL)
+			scws_free_result(res); // whole result list consumed, release it
+		return m_sAccum;
+	}
+
 	bool bGotNonToken = ( !IS_QUERY || m_bPhrase ); // only do this in query mode, never in indexing mode, never within phrases
 	bool bGotSoft = false; // hey Beavis he said soft huh huhhuh
 

From 84c2be81f52df3ff9e3beff7f4b3d5b08e57af7d Mon Sep 17 00:00:00 2001
From: hetao <hetao@hetao.name>
Date: Fri, 28 Apr 2017 14:22:08 +0800
Subject: [PATCH 14/14] support multi dict and xdb format

---
 src/sphinx.cpp | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/sphinx.cpp b/src/sphinx.cpp
index 286e35330..da5ac169e 100644
--- a/src/sphinx.cpp
+++ b/src/sphinx.cpp
@@ -2595,11 +2595,27 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2
 			scws_config_set=true;
 			if ( !tSettings.m_scwsDict.IsEmpty ()  )
 			{ 
-				scws_set_dict(scws_global, tSettings.m_scwsDict.cstr (), SCWS_XDICT_TXT | SCWS_XDICT_XDB | SCWS_XDICT_MEM);
+				// m_scwsDict may list several dictionaries separated by space, tab, comma or semicolon
+				CSphVector<CSphString> dicts;
+				sphSplit ( dicts, tSettings.m_scwsDict.cstr() ," \t,;");
+				ARRAY_FOREACH ( i, dicts)
+				{
+					// SCWS_XDICT_XDB and SCWS_XDICT_TXT name the on-disk format and are alternatives:
+					// files ending in ".xdb" are opened as XDB, everything else as plain text
+					int mode = SCWS_XDICT_MEM;
+					if ( dicts[i].Ends(".xdb") )
+						mode |= SCWS_XDICT_XDB;
+					else
+						mode |= SCWS_XDICT_TXT;
+					int ret = scws_add_dict(scws_global, dicts[i].cstr (), mode);
+					sphInfo("scws set dict [%s], mode [%d], ret [%d]",dicts[i].cstr (),mode,ret);
+				}
+
 			}
 			if ( !tSettings.m_scwsRule.IsEmpty ())
 			{ 
-				scws_set_rule(scws_global, tSettings.m_scwsDict.cstr ());
+				scws_set_rule(scws_global, tSettings.m_scwsRule.cstr ());
+				sphInfo("scws set rule [%s]",tSettings.m_scwsRule.cstr ());
 			}
 			scws_set_charset(scws_global, "utf8");
 			scws_set_ignore(scws_global, true);
@@ -2608,6 +2624,7 @@ class CSphTokenizer_SCWS : public CSphTokenizerBase2
 			if ( tSettings.m_scwsMulti)
 			{ 
 				scws_set_multi(scws_global, tSettings.m_scwsMulti << 12);
+				sphInfo("scws set multi[%d]",tSettings.m_scwsMulti);
 			}else{
 				scws_set_multi(scws_global, 0);
 			}