diff --git a/.gitmodules b/.gitmodules index 950af431..70488cd3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "src/llama.cpp"] path = src/llama.cpp - url = git@github.com:Bip-Rep/llama.cpp.git + url = https://github.com/ggerganov/llama.cpp.git diff --git a/README.md b/README.md index 6118625a..a64d9800 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,16 @@ # Demo App for llama.cpp Model This app is a demo of the llama.cpp model that tries to recreate an offline chatbot, working similar to OpenAI's ChatGPT. The source code for this app is available on GitHub. -# Now it works with Vicuna !!! -You can use the latest models on the app. - # Works on multiple devices : Windows, mac and android ! [Releases page](https://github.com/Bip-Rep/sherpa/releases) -The app was developed using Flutter and implements ggerganov/llama.cpp, recompiled to work on mobiles. Please note that Meta officially distributes the LLaMA models, and they will not be provided by the app developers. +The app was developed using Flutter and is built upon ggerganov/llama.cpp, recompiled to work on mobiles. + +The app will prompt you for a model file, which you need to provide. This must be a GGML model that is compatible with llama.cpp as of June 30th 2023. Models from previous versions of Sherpa (which used an older version of llama.cpp) may no longer be compatible without conversion. The llama.cpp repository provides tools to convert ggml models to the latest format, as well as to produce ggml models from the original datasets. -To run this app, you need to download the 7B llama model from Meta for research purposes. You can choose the target model (should be a xxx.bin) from the app. +You can experiment with [Orca Mini 3B](https://huggingface.co/TheBloke/orca_mini_3B-GGML). These models (e.g. orca-mini-3b.ggmlv3.q4_0.bin from June 24 2023) are directly compatible with this version of Sherpa. Additionally, you can fine-tune the output with preprompts to improve its performance. 
@@ -23,7 +22,7 @@ It shows a OnePlus 7 with 8Gb running Sherpa without speed up. ## Usage To use this app, follow these steps: -1. Download the `ggml-model.bin` from Meta for research purposes. +1. Download the GGML model from your chosen source. 2. Rename the downloaded file to `ggml-model.bin`. 3. Place the file in your device's download folder. 4. Run the app on your mobile device. diff --git a/android/app/build.gradle b/android/app/build.gradle index 07bee95a..6886b890 100644 --- a/android/app/build.gradle +++ b/android/app/build.gradle @@ -48,6 +48,10 @@ android { } defaultConfig { + ndk { + abiFilters 'arm64-v8a', 'x86_64' + } + // TODO: Specify your own unique Application ID (https://developer.android.com/studio/build/application-id.html). applicationId "com.biprep.sherpa" // You can update the following values to match your application needs. @@ -59,32 +63,23 @@ android { } signingConfigs { - - release { - storeFile keystoreProperties['storeFile'] ? file(keystoreProperties['storeFile']) : null - storePassword keystoreProperties['storePassword'] - keyAlias keystoreProperties['releaseAlias'] - keyPassword keystoreProperties['releasePassword'] - } - - debug { - storeFile keystoreProperties['storeFile'] ? file(keystoreProperties['storeFile']) : null - storePassword keystoreProperties['storePassword'] - keyAlias keystoreProperties['releaseAlias'] - keyPassword keystoreProperties['releasePassword'] - } - - } - - buildTypes { - release { - signingConfig signingConfigs.release - proguardFiles getDefaultProguardFile("proguard-android.txt"), "proguard-rules.pro" - } - - debug { - signingConfig signingConfigs.release - proguardFiles getDefaultProguardFile("proguard-android.txt"), "proguard-rules.pro" + release { + keyAlias keystoreProperties['keyAlias'] + keyPassword keystoreProperties['keyPassword'] + storeFile keystoreProperties['storeFile'] ? 
file(keystoreProperties['storeFile']) : null + storePassword keystoreProperties['storePassword'] + } + } + + buildTypes { + release { + signingConfig signingConfigs.release + } + } + + externalNativeBuild { + cmake { + path "../../src/CMakeLists.txt" } } } diff --git a/assets/libs/libllama.so b/assets/libs/libllama.so deleted file mode 100755 index 957f9e89..00000000 Binary files a/assets/libs/libllama.so and /dev/null differ diff --git a/assets/libs/llama.dll b/assets/libs/llama.dll deleted file mode 100644 index 6378a8ef..00000000 Binary files a/assets/libs/llama.dll and /dev/null differ diff --git a/lib/generated_bindings_llama.dart b/lib/generated_bindings_llama.dart deleted file mode 100644 index e50b5c7f..00000000 --- a/lib/generated_bindings_llama.dart +++ /dev/null @@ -1,907 +0,0 @@ -// AUTO GENERATED FILE, DO NOT EDIT. -// -// Generated by `package:ffigen`. -import 'dart:ffi' as ffi; - -class NativeLibrary { - /// Holds the symbol lookup function. - final ffi.Pointer Function(String symbolName) - _lookup; - - /// The symbols are looked up in [dynamicLibrary]. - NativeLibrary(ffi.DynamicLibrary dynamicLibrary) - : _lookup = dynamicLibrary.lookup; - - /// The symbols are looked up with [lookup]. 
- NativeLibrary.fromLookup( - ffi.Pointer Function(String symbolName) - lookup) - : _lookup = lookup; - - void __va_start( - ffi.Pointer arg0, - ) { - return ___va_start( - arg0, - ); - } - - late final ___va_startPtr = - _lookup)>>( - '__va_start'); - late final ___va_start = - ___va_startPtr.asFunction)>(); - - void __security_init_cookie() { - return ___security_init_cookie(); - } - - late final ___security_init_cookiePtr = - _lookup>( - '__security_init_cookie'); - late final ___security_init_cookie = - ___security_init_cookiePtr.asFunction(); - - void __security_check_cookie( - int _StackCookie, - ) { - return ___security_check_cookie( - _StackCookie, - ); - } - - late final ___security_check_cookiePtr = - _lookup>( - '__security_check_cookie'); - late final ___security_check_cookie = - ___security_check_cookiePtr.asFunction(); - - void __report_gsfailure( - int _StackCookie, - ) { - return ___report_gsfailure( - _StackCookie, - ); - } - - late final ___report_gsfailurePtr = - _lookup>( - '__report_gsfailure'); - late final ___report_gsfailure = - ___report_gsfailurePtr.asFunction(); - - late final ffi.Pointer ___security_cookie = - _lookup('__security_cookie'); - - int get __security_cookie => ___security_cookie.value; - - set __security_cookie(int value) => ___security_cookie.value = value; - - void _invalid_parameter_noinfo() { - return __invalid_parameter_noinfo(); - } - - late final __invalid_parameter_noinfoPtr = - _lookup>( - '_invalid_parameter_noinfo'); - late final __invalid_parameter_noinfo = - __invalid_parameter_noinfoPtr.asFunction(); - - void _invalid_parameter_noinfo_noreturn() { - return __invalid_parameter_noinfo_noreturn(); - } - - late final __invalid_parameter_noinfo_noreturnPtr = - _lookup>( - '_invalid_parameter_noinfo_noreturn'); - late final __invalid_parameter_noinfo_noreturn = - __invalid_parameter_noinfo_noreturnPtr.asFunction(); - - void _invoke_watson( - ffi.Pointer _Expression, - ffi.Pointer _FunctionName, - ffi.Pointer 
_FileName, - int _LineNo, - int _Reserved, - ) { - return __invoke_watson( - _Expression, - _FunctionName, - _FileName, - _LineNo, - _Reserved, - ); - } - - late final __invoke_watsonPtr = _lookup< - ffi.NativeFunction< - ffi.Void Function( - ffi.Pointer, - ffi.Pointer, - ffi.Pointer, - ffi.UnsignedInt, - ffi.UintPtr)>>('_invoke_watson'); - late final __invoke_watson = __invoke_watsonPtr.asFunction< - void Function(ffi.Pointer, ffi.Pointer, - ffi.Pointer, int, int)>(); - - ffi.Pointer _errno() { - return __errno(); - } - - late final __errnoPtr = - _lookup Function()>>('_errno'); - late final __errno = __errnoPtr.asFunction Function()>(); - - int _set_errno( - int _Value, - ) { - return __set_errno( - _Value, - ); - } - - late final __set_errnoPtr = - _lookup>('_set_errno'); - late final __set_errno = __set_errnoPtr.asFunction(); - - int _get_errno( - ffi.Pointer _Value, - ) { - return __get_errno( - _Value, - ); - } - - late final __get_errnoPtr = - _lookup)>>( - '_get_errno'); - late final __get_errno = - __get_errnoPtr.asFunction)>(); - - int __threadid() { - return ___threadid(); - } - - late final ___threadidPtr = - _lookup>('__threadid'); - late final ___threadid = ___threadidPtr.asFunction(); - - int __threadhandle() { - return ___threadhandle(); - } - - late final ___threadhandlePtr = - _lookup>('__threadhandle'); - late final ___threadhandle = ___threadhandlePtr.asFunction(); - - llama_context_params llama_context_default_params() { - return _llama_context_default_params(); - } - - late final _llama_context_default_paramsPtr = - _lookup>( - 'llama_context_default_params'); - late final _llama_context_default_params = _llama_context_default_paramsPtr - .asFunction(); - - bool llama_mmap_supported() { - return _llama_mmap_supported(); - } - - late final _llama_mmap_supportedPtr = - _lookup>('llama_mmap_supported'); - late final _llama_mmap_supported = - _llama_mmap_supportedPtr.asFunction(); - - bool llama_mlock_supported() { - return 
_llama_mlock_supported(); - } - - late final _llama_mlock_supportedPtr = - _lookup>('llama_mlock_supported'); - late final _llama_mlock_supported = - _llama_mlock_supportedPtr.asFunction(); - - ffi.Pointer llama_init_from_file( - ffi.Pointer path_model, - llama_context_params params, - ) { - return _llama_init_from_file( - path_model, - params, - ); - } - - late final _llama_init_from_filePtr = _lookup< - ffi.NativeFunction< - ffi.Pointer Function(ffi.Pointer, - llama_context_params)>>('llama_init_from_file'); - late final _llama_init_from_file = _llama_init_from_filePtr.asFunction< - ffi.Pointer Function( - ffi.Pointer, llama_context_params)>(); - - void llama_free( - ffi.Pointer ctx, - ) { - return _llama_free( - ctx, - ); - } - - late final _llama_freePtr = _lookup< - ffi.NativeFunction)>>( - 'llama_free'); - late final _llama_free = - _llama_freePtr.asFunction)>(); - - int llama_model_quantize( - ffi.Pointer fname_inp, - ffi.Pointer fname_out, - int ftype, - ) { - return _llama_model_quantize( - fname_inp, - fname_out, - ftype, - ); - } - - late final _llama_model_quantizePtr = _lookup< - ffi.NativeFunction< - ffi.Int Function(ffi.Pointer, ffi.Pointer, - ffi.Int32)>>('llama_model_quantize'); - late final _llama_model_quantize = _llama_model_quantizePtr.asFunction< - int Function(ffi.Pointer, ffi.Pointer, int)>(); - - ffi.Pointer llama_get_kv_cache( - ffi.Pointer ctx, - ) { - return _llama_get_kv_cache( - ctx, - ); - } - - late final _llama_get_kv_cachePtr = _lookup< - ffi.NativeFunction< - ffi.Pointer Function( - ffi.Pointer)>>('llama_get_kv_cache'); - late final _llama_get_kv_cache = _llama_get_kv_cachePtr.asFunction< - ffi.Pointer Function(ffi.Pointer)>(); - - int llama_get_kv_cache_size( - ffi.Pointer ctx, - ) { - return _llama_get_kv_cache_size( - ctx, - ); - } - - late final _llama_get_kv_cache_sizePtr = _lookup< - ffi.NativeFunction)>>( - 'llama_get_kv_cache_size'); - late final _llama_get_kv_cache_size = _llama_get_kv_cache_sizePtr - .asFunction)>(); - - 
int llama_get_kv_cache_token_count( - ffi.Pointer ctx, - ) { - return _llama_get_kv_cache_token_count( - ctx, - ); - } - - late final _llama_get_kv_cache_token_countPtr = - _lookup)>>( - 'llama_get_kv_cache_token_count'); - late final _llama_get_kv_cache_token_count = - _llama_get_kv_cache_token_countPtr - .asFunction)>(); - - void llama_set_kv_cache( - ffi.Pointer ctx, - ffi.Pointer kv_cache, - int n_size, - int n_token_count, - ) { - return _llama_set_kv_cache( - ctx, - kv_cache, - n_size, - n_token_count, - ); - } - - late final _llama_set_kv_cachePtr = _lookup< - ffi.NativeFunction< - ffi.Void Function(ffi.Pointer, ffi.Pointer, - ffi.Size, ffi.Int)>>('llama_set_kv_cache'); - late final _llama_set_kv_cache = _llama_set_kv_cachePtr.asFunction< - void Function( - ffi.Pointer, ffi.Pointer, int, int)>(); - - int llama_eval( - ffi.Pointer ctx, - ffi.Pointer tokens, - int n_tokens, - int n_past, - int n_threads, - ) { - return _llama_eval( - ctx, - tokens, - n_tokens, - n_past, - n_threads, - ); - } - - late final _llama_evalPtr = _lookup< - ffi.NativeFunction< - ffi.Int Function(ffi.Pointer, ffi.Pointer, - ffi.Int, ffi.Int, ffi.Int)>>('llama_eval'); - late final _llama_eval = _llama_evalPtr.asFunction< - int Function(ffi.Pointer, ffi.Pointer, int, - int, int)>(); - - int llama_tokenize( - ffi.Pointer ctx, - ffi.Pointer text, - ffi.Pointer tokens, - int n_max_tokens, - bool add_bos, - ) { - return _llama_tokenize( - ctx, - text, - tokens, - n_max_tokens, - add_bos, - ); - } - - late final _llama_tokenizePtr = _lookup< - ffi.NativeFunction< - ffi.Int Function(ffi.Pointer, ffi.Pointer, - ffi.Pointer, ffi.Int, ffi.Bool)>>('llama_tokenize'); - late final _llama_tokenize = _llama_tokenizePtr.asFunction< - int Function(ffi.Pointer, ffi.Pointer, - ffi.Pointer, int, bool)>(); - - int llama_n_vocab( - ffi.Pointer ctx, - ) { - return _llama_n_vocab( - ctx, - ); - } - - late final _llama_n_vocabPtr = - _lookup)>>( - 'llama_n_vocab'); - late final _llama_n_vocab = - 
_llama_n_vocabPtr.asFunction)>(); - - int llama_n_ctx( - ffi.Pointer ctx, - ) { - return _llama_n_ctx( - ctx, - ); - } - - late final _llama_n_ctxPtr = - _lookup)>>( - 'llama_n_ctx'); - late final _llama_n_ctx = - _llama_n_ctxPtr.asFunction)>(); - - int llama_n_embd( - ffi.Pointer ctx, - ) { - return _llama_n_embd( - ctx, - ); - } - - late final _llama_n_embdPtr = - _lookup)>>( - 'llama_n_embd'); - late final _llama_n_embd = - _llama_n_embdPtr.asFunction)>(); - - ffi.Pointer llama_get_logits( - ffi.Pointer ctx, - ) { - return _llama_get_logits( - ctx, - ); - } - - late final _llama_get_logitsPtr = _lookup< - ffi.NativeFunction< - ffi.Pointer Function( - ffi.Pointer)>>('llama_get_logits'); - late final _llama_get_logits = _llama_get_logitsPtr.asFunction< - ffi.Pointer Function(ffi.Pointer)>(); - - ffi.Pointer llama_get_embeddings( - ffi.Pointer ctx, - ) { - return _llama_get_embeddings( - ctx, - ); - } - - late final _llama_get_embeddingsPtr = _lookup< - ffi.NativeFunction< - ffi.Pointer Function( - ffi.Pointer)>>('llama_get_embeddings'); - late final _llama_get_embeddings = _llama_get_embeddingsPtr.asFunction< - ffi.Pointer Function(ffi.Pointer)>(); - - ffi.Pointer llama_token_to_str( - ffi.Pointer ctx, - int token, - ) { - return _llama_token_to_str( - ctx, - token, - ); - } - - late final _llama_token_to_strPtr = _lookup< - ffi.NativeFunction< - ffi.Pointer Function( - ffi.Pointer, llama_token)>>('llama_token_to_str'); - late final _llama_token_to_str = _llama_token_to_strPtr.asFunction< - ffi.Pointer Function(ffi.Pointer, int)>(); - - int llama_token_bos() { - return _llama_token_bos(); - } - - late final _llama_token_bosPtr = - _lookup>('llama_token_bos'); - late final _llama_token_bos = - _llama_token_bosPtr.asFunction(); - - int llama_token_eos() { - return _llama_token_eos(); - } - - late final _llama_token_eosPtr = - _lookup>('llama_token_eos'); - late final _llama_token_eos = - _llama_token_eosPtr.asFunction(); - - int llama_sample_top_p_top_k( - 
ffi.Pointer ctx, - ffi.Pointer last_n_tokens_data, - int last_n_tokens_size, - int top_k, - double top_p, - double temp, - double repeat_penalty, - ) { - return _llama_sample_top_p_top_k( - ctx, - last_n_tokens_data, - last_n_tokens_size, - top_k, - top_p, - temp, - repeat_penalty, - ); - } - - late final _llama_sample_top_p_top_kPtr = _lookup< - ffi.NativeFunction< - llama_token Function( - ffi.Pointer, - ffi.Pointer, - ffi.Int, - ffi.Int, - ffi.Float, - ffi.Float, - ffi.Float)>>('llama_sample_top_p_top_k'); - late final _llama_sample_top_p_top_k = - _llama_sample_top_p_top_kPtr.asFunction< - int Function(ffi.Pointer, ffi.Pointer, - int, int, double, double, double)>(); - - void llama_print_timings( - ffi.Pointer ctx, - ) { - return _llama_print_timings( - ctx, - ); - } - - late final _llama_print_timingsPtr = _lookup< - ffi.NativeFunction)>>( - 'llama_print_timings'); - late final _llama_print_timings = _llama_print_timingsPtr - .asFunction)>(); - - void llama_reset_timings( - ffi.Pointer ctx, - ) { - return _llama_reset_timings( - ctx, - ); - } - - late final _llama_reset_timingsPtr = _lookup< - ffi.NativeFunction)>>( - 'llama_reset_timings'); - late final _llama_reset_timings = _llama_reset_timingsPtr - .asFunction)>(); - - ffi.Pointer llama_print_system_info() { - return _llama_print_system_info(); - } - - late final _llama_print_system_infoPtr = - _lookup Function()>>( - 'llama_print_system_info'); - late final _llama_print_system_info = _llama_print_system_infoPtr - .asFunction Function()>(); -} - -typedef va_list = ffi.Pointer; - -class __crt_locale_data_public extends ffi.Struct { - external ffi.Pointer _locale_pctype; - - @ffi.Int() - external int _locale_mb_cur_max; - - @ffi.UnsignedInt() - external int _locale_lc_codepage; -} - -class __crt_locale_pointers extends ffi.Struct { - external ffi.Pointer<__crt_locale_data> locinfo; - - external ffi.Pointer<__crt_multibyte_data> mbcinfo; -} - -class __crt_locale_data extends ffi.Opaque {} - -class 
__crt_multibyte_data extends ffi.Opaque {} - -class _Mbstatet extends ffi.Struct { - @ffi.UnsignedLong() - external int _Wchar; - - @ffi.UnsignedShort() - external int _Byte; - - @ffi.UnsignedShort() - external int _State; -} - -typedef errno_t = ffi.Int; - -class gpt_params extends ffi.Struct { - @ffi.Int32() - external int seed; - - @ffi.Int32() - external int n_threads; - - @ffi.Int32() - external int n_predict; - - @ffi.Int32() - external int repeat_last_n; - - @ffi.Int32() - external int n_parts; - - @ffi.Int32() - external int n_ctx; - - @ffi.Int32() - external int n_batch; - - @ffi.Int32() - external int n_keep; - - @ffi.Int32() - external int top_k; - - @ffi.Float() - external double top_p; - - @ffi.Float() - external double temp; - - @ffi.Float() - external double repeat_penalty; - - @ffi.Int() - external int std; - - @ffi.Bool() - external bool memory_f16; - - @ffi.Bool() - external bool random_prompt; - - @ffi.Bool() - external bool use_color; - - @ffi.Bool() - external bool interactive; - - @ffi.Bool() - external bool embedding; - - @ffi.Bool() - external bool interactive_start; - - @ffi.Bool() - external bool instruct; - - @ffi.Bool() - external bool ignore_eos; - - @ffi.Bool() - external bool perplexity; - - @ffi.Bool() - external bool use_mmap; - - @ffi.Bool() - external bool use_mlock; - - @ffi.Bool() - external bool mem_test; - - @ffi.Bool() - external bool verbose_prompt; -} - -class llama_context extends ffi.Opaque {} - -class llama_token_data extends ffi.Struct { - @llama_token() - external int id; - - @ffi.Float() - external double p; - - @ffi.Float() - external double plog; -} - -typedef llama_token = ffi.Int; - -class llama_context_params extends ffi.Struct { - @ffi.Int() - external int n_ctx; - - @ffi.Int() - external int n_parts; - - @ffi.Int() - external int seed; - - @ffi.Bool() - external bool f16_kv; - - @ffi.Bool() - external bool logits_all; - - @ffi.Bool() - external bool vocab_only; - - @ffi.Bool() - external bool use_mmap; - - 
@ffi.Bool() - external bool use_mlock; - - @ffi.Bool() - external bool embedding; - - external llama_progress_callback progress_callback; - - external ffi.Pointer progress_callback_user_data; -} - -typedef llama_progress_callback = ffi.Pointer< - ffi.NativeFunction)>>; - -abstract class llama_ftype { - static const int LLAMA_FTYPE_ALL_F32 = 0; - static const int LLAMA_FTYPE_MOSTLY_F16 = 1; - static const int LLAMA_FTYPE_MOSTLY_Q4_0 = 2; - static const int LLAMA_FTYPE_MOSTLY_Q4_1 = 3; -} - -const int _VCRT_COMPILER_PREPROCESSOR = 1; - -const int _SAL_VERSION = 20; - -const int __SAL_H_VERSION = 180000000; - -const int _USE_DECLSPECS_FOR_SAL = 0; - -const int _USE_ATTRIBUTES_FOR_SAL = 0; - -const int _CRT_PACKING = 8; - -const int _VCRUNTIME_DISABLED_WARNINGS = 4514; - -const int _HAS_EXCEPTIONS = 1; - -const int _WCHAR_T_DEFINED = 1; - -const int NULL = 0; - -const int _HAS_CXX17 = 0; - -const int _HAS_CXX20 = 0; - -const int _HAS_CXX23 = 0; - -const int _HAS_NODISCARD = 1; - -const int _ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE = 1; - -const int _CRT_BUILD_DESKTOP_APP = 1; - -const int _UCRT_DISABLED_WARNINGS = 4324; - -const int _ARGMAX = 100; - -const int _TRUNCATE = -1; - -const int _CRT_INT_MAX = 2147483647; - -const int _CRT_SIZE_MAX = -1; - -const String __FILEW__ = 'C'; - -const int _CRT_FUNCTIONS_REQUIRED = 1; - -const int _CRT_HAS_CXX17 = 0; - -const int _CRT_HAS_C11 = 0; - -const int _CRT_INTERNAL_NONSTDC_NAMES = 1; - -const int __STDC_SECURE_LIB__ = 200411; - -const int __GOT_SECURE_LIB__ = 200411; - -const int __STDC_WANT_SECURE_LIB__ = 1; - -const int _SECURECRT_FILL_BUFFER_PATTERN = 254; - -const int _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES = 0; - -const int _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES_COUNT = 0; - -const int _CRT_SECURE_CPP_OVERLOAD_SECURE_NAMES = 1; - -const int _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES_MEMORY = 0; - -const int _CRT_SECURE_CPP_OVERLOAD_SECURE_NAMES_MEMORY = 0; - -const int INT8_MIN = -128; - -const int INT16_MIN = -32768; 
- -const int INT32_MIN = -2147483648; - -const int INT64_MIN = -9223372036854775808; - -const int INT8_MAX = 127; - -const int INT16_MAX = 32767; - -const int INT32_MAX = 2147483647; - -const int INT64_MAX = 9223372036854775807; - -const int UINT8_MAX = 255; - -const int UINT16_MAX = 65535; - -const int UINT32_MAX = 4294967295; - -const int UINT64_MAX = -1; - -const int INT_LEAST8_MIN = -128; - -const int INT_LEAST16_MIN = -32768; - -const int INT_LEAST32_MIN = -2147483648; - -const int INT_LEAST64_MIN = -9223372036854775808; - -const int INT_LEAST8_MAX = 127; - -const int INT_LEAST16_MAX = 32767; - -const int INT_LEAST32_MAX = 2147483647; - -const int INT_LEAST64_MAX = 9223372036854775807; - -const int UINT_LEAST8_MAX = 255; - -const int UINT_LEAST16_MAX = 65535; - -const int UINT_LEAST32_MAX = 4294967295; - -const int UINT_LEAST64_MAX = -1; - -const int INT_FAST8_MIN = -128; - -const int INT_FAST16_MIN = -2147483648; - -const int INT_FAST32_MIN = -2147483648; - -const int INT_FAST64_MIN = -9223372036854775808; - -const int INT_FAST8_MAX = 127; - -const int INT_FAST16_MAX = 2147483647; - -const int INT_FAST32_MAX = 2147483647; - -const int INT_FAST64_MAX = 9223372036854775807; - -const int UINT_FAST8_MAX = 255; - -const int UINT_FAST16_MAX = 4294967295; - -const int UINT_FAST32_MAX = 4294967295; - -const int UINT_FAST64_MAX = -1; - -const int INTPTR_MIN = -9223372036854775808; - -const int INTPTR_MAX = 9223372036854775807; - -const int UINTPTR_MAX = -1; - -const int INTMAX_MIN = -9223372036854775808; - -const int INTMAX_MAX = 9223372036854775807; - -const int UINTMAX_MAX = -1; - -const int PTRDIFF_MIN = -9223372036854775808; - -const int PTRDIFF_MAX = 9223372036854775807; - -const int SIZE_MAX = -1; - -const int SIG_ATOMIC_MIN = -2147483648; - -const int SIG_ATOMIC_MAX = 2147483647; - -const int WCHAR_MIN = 0; - -const int WCHAR_MAX = 65535; - -const int WINT_MIN = 0; - -const int WINT_MAX = 65535; - -const int __bool_true_false_are_defined = 1; - -const int 
false1 = 0; - -const int true1 = 1; - -const int LLAMA_FILE_VERSION = 1; - -const int LLAMA_FILE_MAGIC = 1734830708; - -const int LLAMA_FILE_MAGIC_UNVERSIONED = 1734831468; diff --git a/lib/generated_bindings_llamasherpa.dart b/lib/generated_bindings_llamasherpa.dart new file mode 100644 index 00000000..15c9d416 --- /dev/null +++ b/lib/generated_bindings_llamasherpa.dart @@ -0,0 +1,74 @@ +// AUTO GENERATED FILE, DO NOT EDIT. +// +// Generated by `package:ffigen`. +import 'dart:ffi' as ffi; + +class NativeLibrary { + /// Holds the symbol lookup function. + final ffi.Pointer Function(String symbolName) + _lookup; + + /// The symbols are looked up in [dynamicLibrary]. + NativeLibrary(ffi.DynamicLibrary dynamicLibrary) + : _lookup = dynamicLibrary.lookup; + + /// The symbols are looked up with [lookup]. + NativeLibrary.fromLookup( + ffi.Pointer Function(String symbolName) + lookup) + : _lookup = lookup; + + int llamasherpa_start( + ffi.Pointer model_path, + ffi.Pointer _prompt, + ffi.Pointer _antiprompt, + ffi.Pointer show_output, + ) { + return _llamasherpa_start( + model_path, + _prompt, + _antiprompt, + show_output, + ); + } + + late final _llamasherpa_startPtr = _lookup< + ffi.NativeFunction< + ffi.Int Function( + ffi.Pointer, + ffi.Pointer, + ffi.Pointer, + ffi.Pointer)>>('llamasherpa_start'); + late final _llamasherpa_start = _llamasherpa_startPtr.asFunction< + int Function(ffi.Pointer, ffi.Pointer, + ffi.Pointer, ffi.Pointer)>(); + + int llamasherpa_continue( + ffi.Pointer input, + ffi.Pointer show_output, + ) { + return _llamasherpa_continue( + input, + show_output, + ); + } + + late final _llamasherpa_continuePtr = _lookup< + ffi.NativeFunction< + ffi.Int Function(ffi.Pointer, + ffi.Pointer)>>('llamasherpa_continue'); + late final _llamasherpa_continue = _llamasherpa_continuePtr.asFunction< + int Function(ffi.Pointer, ffi.Pointer)>(); + + void llamasherpa_exit() { + return _llamasherpa_exit(); + } + + late final _llamasherpa_exitPtr = + 
_lookup>('llamasherpa_exit'); + late final _llamasherpa_exit = + _llamasherpa_exitPtr.asFunction(); +} + +typedef show_output_cb + = ffi.NativeFunction)>; diff --git a/lib/lib.dart b/lib/lib.dart index cfcf46f6..24d85a83 100644 --- a/lib/lib.dart +++ b/lib/lib.dart @@ -15,7 +15,7 @@ import 'package:flutter/services.dart'; import 'package:permission_handler/permission_handler.dart'; import 'package:sherpa/ModelFilePath.dart'; -import 'package:sherpa/generated_bindings_llama.dart'; +import 'package:sherpa/generated_bindings_llamasherpa.dart'; import 'package:sherpa/main.dart'; // import 'package:sherpa/generated_bindings.dart'; @@ -155,37 +155,6 @@ class Lib { return pointerPointer; } - static Future loadDllAndroid( - String fileName, ByteData byteData) async { - final buffer = byteData.buffer; - - Directory tempDir = await getApplicationDocumentsDirectory(); - String tempPath = tempDir.path; - - File file = await File('$tempPath/${fileName}').writeAsBytes( - buffer.asUint8List(byteData.offsetInBytes, byteData.lengthInBytes)); - print('size of file ${file.lengthSync()}'); - return DynamicLibrary.open(file.path); - } - - static Future loadDllWindows( - String fileName, ByteData byteData) async { - final buffer = byteData.buffer; - Directory tempDir = await getTemporaryDirectory(); - String tempPath = tempDir.path; - File file = await File( - '$tempPath/${DateTime.now().millisecondsSinceEpoch}$fileName') - .writeAsBytes( - buffer.asUint8List(byteData.offsetInBytes, byteData.lengthInBytes)); - print('size of file ${file.lengthSync()}'); - return DynamicLibrary.open(file.path); - } - - static Future loadDllDart(String fileName) async { - print('loadDllDart $fileName'); - return DynamicLibrary.open(fileName); - } - // std::vector llama_tokenize(struct llama_context * ctx, const gptParams.ref. & text, gptParams.ref. 
add_bos) { // // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars // std::vector res(text.size() + (int)add_bos); @@ -195,18 +164,6 @@ class Lib { // return res; // } - static Vector tokenize(NativeLibrary llamaBinded, - Pointer ctx, String text, bool add_bos) { - var resLength = text.length + (add_bos ? 1 : 0); - var res = malloc.allocate(sizeOf() * resLength); - var vector = Vector(res, resLength); - var n = llamaBinded.llama_tokenize( - ctx, text.toNativeUtf8().cast(), res, resLength, add_bos); - assert(n >= 0); - vector.length = n; - - return vector; - } static parserIsolateFunction( SendPort mainSendPort, @@ -254,20 +211,12 @@ class Lib { required void Function(String log) printLnLog, required void Function(String log) printLog, required String promptPassed, + required String firstInteraction, required void Function() done, required void Function() canStop, required String stopToken, required ParamsLlamaValuesOnly paramsLlamaValuesOnly, }) async { - ByteData libAndroid = await rootBundle.load('assets/libs/libllama.so'); - ByteData? libWindows; - try { - libWindows = await rootBundle.load('assets/libs/llama.dll'); - } catch (e) {} - ByteData? libLinux; - try { - libLinux = await rootBundle.load('assets/libs/libllama_linux.so'); - } catch (e) {} RootIsolateToken? 
token = ServicesBinding.rootIsolateToken; mainSendPort = mainReceivePort.sendPort; _isolate = await runZonedGuarded( @@ -280,11 +229,9 @@ class Lib { if (message is SendPort) { isolateSendPort = message; isolateSendPort?.send(ParsingDemand( - libLinux: libLinux, - libAndroid: libAndroid, - libWindows: libWindows, rootIsolateToken: token, promptPassed: promptPassed, + firstInteraction: firstInteraction, stopToken: stopToken, paramsLlamaValuesOnly: paramsLlamaValuesOnly, )); @@ -334,119 +281,33 @@ class Lib { static Completer interaction = Completer(); + static void showOutput(Pointer output) { + logInline(output.cast().toDartString()); + } + binaryIsolate({ required ParsingDemand parsingDemand, required SendPort mainSendPort, required String stopToken, }) async { interaction.complete(); - String ttlString = ""; - var stopTokenTrimed = stopToken.trim().replaceAll(' ', ''); - int stopTokenLength = stopTokenTrimed.length; mainPort = mainSendPort; if (parsingDemand.rootIsolateToken == null) return; BackgroundIsolateBinaryMessenger.ensureInitialized( parsingDemand.rootIsolateToken!); - DynamicLibrary llama = Platform.isAndroid - ? await loadDllAndroid("libllama.so", parsingDemand.libAndroid) - : (Platform.isIOS - ? DynamicLibrary.executable() - : (Platform.isWindows - ? await loadDllWindows("llama.dll", parsingDemand.libWindows!) - : (Platform.isMacOS - ? DynamicLibrary.executable() - : (Platform.isLinux - ? await loadDllWindows( - "libllama_linux.so", parsingDemand.libLinux!) - : DynamicLibrary.executable())))); - var prompt = parsingDemand.promptPassed; + DynamicLibrary llamasherpa = + Platform.isMacOS || Platform.isIOS + ? DynamicLibrary.process() // macos and ios + : (DynamicLibrary.open( + Platform.isWindows // windows + ? 
'llamasherpa.dll' + : 'libllamasherpa.so')); // android and linux log( - "llama loaded", + "llamasherpa loaded", ); - // DynamicLibrary main = await loadDll("libmain.so"); - - // NativeLibrary ggmlAutobinded = NativeLibrary(ggml); - NativeLibrary llamaBinded = NativeLibrary(llama); - // NativeLibrary mainBinded = NativeLibrary(main); - - // print("1${llama.providesSymbol('add')}"); - // print("2${llama.providesSymbol('llama_free')}"); - // print("3${llama.providesSymbol('llama_hparams')}"); - // print("4${llama.providesSymbol('llama_model_load')}"); - - // print("5${ggml.providesSymbol('ggml_time_init')}"); - // final chat = llama.lookupFunction('add'); - - // dev.log('calling native function'); - // final result = chat(2, 15); - // dev.log('result is $result'); // 42 - - // print("trying lookup ggml_time_init"); - // ggmlAutobinded.ggml_time_init(); - - // print("ggml_time_init done"); - - // print("initialize found : ${main.providesSymbol('initialize')}"); - - var gptParams = malloc.allocate(sizeOf()); - gptParams.ref.seed = int.parse(parsingDemand.paramsLlamaValuesOnly.seed); - gptParams.ref.n_threads = int.parse( - parsingDemand.paramsLlamaValuesOnly.n_threads); // number of threads - gptParams.ref.n_predict = int.parse( - parsingDemand.paramsLlamaValuesOnly.n_predict); // number of predictions - gptParams.ref.repeat_last_n = int.parse(parsingDemand - .paramsLlamaValuesOnly.repeat_last_n); // repeat last n tokens - gptParams.ref.n_parts = int.parse( - parsingDemand.paramsLlamaValuesOnly.n_parts); // number of parts - gptParams.ref.n_ctx = int.parse(parsingDemand - .paramsLlamaValuesOnly.n_ctx); // number of tokens in context - gptParams.ref.top_k = - int.parse(parsingDemand.paramsLlamaValuesOnly.top_k); // top k sampling - gptParams.ref.top_p = double.parse( - parsingDemand.paramsLlamaValuesOnly.top_p); // top p sampling - gptParams.ref.temp = - double.parse(parsingDemand.paramsLlamaValuesOnly.temp); // temperature - gptParams.ref.repeat_penalty = double.parse( 
- parsingDemand.paramsLlamaValuesOnly.repeat_penalty); // repeat penalty - gptParams.ref.n_batch = int.parse( - parsingDemand.paramsLlamaValuesOnly.n_batch); // number of batches - gptParams.ref.memory_f16 = parsingDemand.paramsLlamaValuesOnly.memory_f16; - gptParams.ref.random_prompt = - parsingDemand.paramsLlamaValuesOnly.random_prompt; - gptParams.ref.use_color = parsingDemand.paramsLlamaValuesOnly.use_color; - gptParams.ref.interactive = parsingDemand.paramsLlamaValuesOnly.interactive; - gptParams.ref.interactive_start = - parsingDemand.paramsLlamaValuesOnly.interactive_start; - gptParams.ref.instruct = parsingDemand.paramsLlamaValuesOnly.instruct; - gptParams.ref.ignore_eos = parsingDemand.paramsLlamaValuesOnly.ignore_eos; - gptParams.ref.perplexity = parsingDemand.paramsLlamaValuesOnly.perplexity; - - gptParams.ref.use_mlock = false; // use mlock to keep model in memory - gptParams.ref.mem_test = false; // compute maximum memory usage - gptParams.ref.verbose_prompt = - false; // print prompt tokens before generation - gptParams.ref.n_keep = 0; - gptParams.ref.embedding = false; - gptParams.ref.use_mmap = true; - - var params = gptParams.ref; - log("main found : ${llama.providesSymbol('llama_context_default_params')}"); - - log("trying main"); - - var ret = llamaBinded.llama_context_default_params(); - log("trying main DONE $ret"); - - ret.n_ctx = params.n_ctx; - ret.n_parts = params.n_parts; - ret.seed = params.seed; - ret.f16_kv = params.memory_f16; - ret.use_mmap = params.use_mmap; - ret.use_mlock = params.use_mlock; - var filePath = await ModelFilePath.getFilePath(); print("filePath : $filePath"); if (filePath == null) { @@ -454,222 +315,32 @@ class Lib { return; } - var ctx = llamaBinded.llama_init_from_file( - filePath.toNativeUtf8().cast(), ret); - // || ctx.cast().value != 0 - if (ctx == nullptr) { - log("context error : ${CreationContextError(ctx.cast().value).toString()}"); - llamaBinded.llama_free(ctx); - return; - } + var prompt = 
parsingDemand.promptPassed; + var firstInteraction = parsingDemand.firstInteraction; + interaction = Completer(); - log("context created"); + Pointer show_output = Pointer.fromFunction(showOutput); - log(' test info : ${(llamaBinded.llama_print_system_info()).cast().toDartString()}'); - var nbInt = 1; - var pointerInt = malloc.allocate(nbInt); - pointerInt[0] = 1; - log(' ret eval : ${llamaBinded.llama_eval(ctx, pointerInt, nbInt, 0, nbInt)}'); + NativeLibrary llamasherpaBinded = NativeLibrary(llamasherpa); + var ret = llamasherpaBinded.llamasherpa_start(filePath.toNativeUtf8().cast(), prompt.toNativeUtf8().cast(), stopToken.trim().toNativeUtf8().cast(), show_output); + // process the prompt + llamasherpaBinded.llamasherpa_continue("".toNativeUtf8().cast(), show_output); - var embd_inp = tokenize(llamaBinded, ctx, ' $prompt', true); - if (embd_inp.length < 0) { - log("error embd_inp"); - return; + // if first line of conversation was provided, pass it now + if (firstInteraction.isNotEmpty) { + llamasherpaBinded.llamasherpa_continue(firstInteraction.toNativeUtf8().cast(), show_output); } - log('embd_inp ${embd_inp.length}'); - - var n_ctx = llamaBinded.llama_n_ctx(ctx); - log('n_ctx ${n_ctx}'); - - gptParams.ref.n_predict = - min(gptParams.ref.n_predict, n_ctx - embd_inp.length); - - var inp_pfx = tokenize(llamaBinded, ctx, "\n\n### Instruction:\n\n", true); - var inp_sfx = tokenize(llamaBinded, ctx, "\n\n### Response:\n\n", false); - var user_token = tokenize(llamaBinded, ctx, "\n$stopTokenTrimed", true); - var llama_token_newline = tokenize(llamaBinded, ctx, "\n", false); - - var embd = Vector(nullptr, 0); - - int last_n_size = gptParams.ref.repeat_last_n; - var last_n_tokens = Vector( - malloc.allocate(sizeOf() * last_n_size), - last_n_size); - last_n_tokens.fillWithValue(0); - - int input_consumed = 0; - bool input_noecho = false; - - int remaining_tokens = gptParams.ref.n_predict; - int n_past = 0; - log('before while loop'); - 
mainSendPort.send(MessageCanStop()); - - while ((remaining_tokens > 0 || gptParams.ref.interactive)) { - log('remaining tokens : $remaining_tokens'); - log('stopGeneration : $stopGeneration'); - if (embd.length > 0) { - if (llamaBinded.llama_eval(ctx, embd.pointer, embd.length, n_past, - gptParams.ref.n_threads) > - 0) { - log("error llama_eval"); - return; - } - await Future.delayed(Duration(milliseconds: 1)); - } - n_past += embd.length; - embd.clear(); - if (stopGeneration) { - interaction = Completer(); - stopGeneration = false; - embd.insertVectorAtEnd(user_token); - } - if (embd_inp.length <= input_consumed && - interaction.isCompleted == true) { - // out of user input, sample next token - var top_k = gptParams.ref.top_k; - var top_p = gptParams.ref.top_p; - var temp = gptParams.ref.temp; - var repeat_penalty = gptParams.ref.repeat_penalty; - - int id = 0; - var logits = llamaBinded.llama_get_logits(ctx); - - if (params.ignore_eos) { - // set the logit of the eos token to zero to avoid sampling it - // logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0; - // TODO: this does not work of params.logits_all == true - assert(params.perplexity == false); - logits[llamaBinded.llama_token_eos()] = 0; - } - - id = llamaBinded.llama_sample_top_p_top_k(ctx, last_n_tokens.pointer, - last_n_tokens.length, top_k, top_p, temp, repeat_penalty); - last_n_tokens.erase(0); - last_n_tokens.push_back(id); - - // if (id == llamaBinded.llama_token_eos() && params.interactive) { - // id = llama_token_newline.begin().value; - // if (stopTokenTrimed.isNotEmpty) { - // // tokenize and inject first reverse prompt - // var first_antiprompt = - // tokenize(llamaBinded, ctx, stopTokenTrimed, false); - // embd_inp.insertVectorAtEnd(first_antiprompt); - // } - // } - - // add it to the context - embd.push_back(id); - - // echo this to console - input_noecho = false; - - // decrement remaining sampling budget - --remaining_tokens; - } else { - // some user input remains from prompt or 
interaction, forward it to processing - while (embd_inp.length > input_consumed) { - embd.push_back(embd_inp.pointer[input_consumed]); - last_n_tokens.erase(0); - last_n_tokens.push_back(embd_inp.pointer[input_consumed]); - ++input_consumed; - if (embd.length >= params.n_batch) { - break; - } - } - } - log('input_noecho = $input_noecho embd.length = ${embd.length}'); - if (!input_noecho) { - for (int i = 0; i < embd.length; ++i) { - try { - int id = embd.pointer[i]; - var str = llamaBinded - .llama_token_to_str(ctx, id) - .cast() - .toDartString(); - logInline(str); - ttlString += str; - if (ttlString.length >= stopTokenLength && - ttlString.length > prompt.length && - stopTokenLength > 0) { - var lastPartTtlString = ttlString - .trim() - .substring(ttlString.trim().length - stopTokenLength - 1) - .toLowerCase() - .replaceAll(' ', '') - .trim(); - log('lastPartTtlString = $lastPartTtlString , stopTokenTrimed = ${stopTokenTrimed.toLowerCase()}, equal = ${lastPartTtlString == stopTokenTrimed.toLowerCase()}'); - if (lastPartTtlString == stopTokenTrimed.toLowerCase()) { - log('is_interacting = true'); - interaction = Completer(); - break; - } - } - } catch (e) { - interaction = Completer(); - } - } - } - - if (params.interactive && embd_inp.length <= input_consumed) { - log('params.interactive && embd_inp.length <= input_consumed && interaction.isCompleted == ${interaction.isCompleted}'); - // malloc.free(pointer); - // malloc.free(pointer); - - log('${ttlString.length} ${prompt.length} ${stopTokenLength}'); - - if (interaction.isCompleted == false) { - // potentially set color to indicate we are taking user input - - // if (params.instruct) { - // input_consumed = embd_inp.length; - // embd_inp.insertVectorAtEnd(inp_pfx); - // } - - // logInline(stopTokenTrimed); - mainSendPort.send(MessageCanPrompt()); - String buffer = await interaction.future; - stopGeneration = false; - // logInline(stopTokenTrimed + '\n'); - - logInline(buffer); - ttlString += buffer; - // done 
taking input, reset color - - var line_inp = tokenize(llamaBinded, ctx, buffer, false); - embd_inp.insertVectorAtEnd(line_inp); - - if (params.instruct) { - embd_inp.insertVectorAtEnd(inp_sfx); - } - - remaining_tokens -= line_inp.length; - log(remaining_tokens.toString()); - - input_noecho = true; // do not echo this again - } - } - - // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. - if (params.interactive && remaining_tokens <= 0) { - remaining_tokens = params.n_predict; - interaction = Completer(); - } + while (true) { + // ask for user input + mainSendPort?.send(MessageCanPrompt()); + String buffer = await interaction.future; + interaction = Completer(); + // process user input + llamasherpaBinded.llamasherpa_continue(buffer.toNativeUtf8().cast(), show_output); } - log(''); - //unload everything from memory - //llamaBinded.llama_free works for ctx only - llamaBinded.llama_free(ctx); - - // - - //other free - malloc.free(embd_inp.pointer); - malloc.free(embd.pointer); - malloc.free(last_n_tokens.pointer); - malloc.free(ctx); - mainSendPort.send(MessageEndFromIsolate('ENDED !')); + + llamasherpaBinded.llamasherpa_exit(); } void main() {} @@ -711,20 +382,16 @@ class MessageNewPrompt { } class ParsingDemand { - ByteData libAndroid; - ByteData? libWindows; - ByteData? libLinux; RootIsolateToken? 
rootIsolateToken; String promptPassed; + String firstInteraction; String stopToken; ParamsLlamaValuesOnly paramsLlamaValuesOnly; ParsingDemand({ - required this.libWindows, - required this.libAndroid, required this.rootIsolateToken, required this.promptPassed, - required this.libLinux, + required this.firstInteraction, required this.stopToken, required this.paramsLlamaValuesOnly, }); diff --git a/lib/main.dart b/lib/main.dart index c8610517..2ebc84e9 100644 --- a/lib/main.dart +++ b/lib/main.dart @@ -77,18 +77,18 @@ class _MyHomePageState extends State { String prePrompt = ""; List defaultPrePrompts = [ - '### Assistant: Hello, I\'m Sherpa, your personal assistant. I can write, complex mails, code and even songs\n' - '### Human: Hello how are you ?\n' - '### Assistant: I\'m fine, thank you. How are you ?\n' - '### Human: I\'m fine too, thanks.\n' - '### Assistant: That\'s good to hear\n' - '### Human:', - 'Sherpa : Hello, I\'m Sherpa, your personal assistant. I can write, complex mails, code and even songs\n' - 'User : Hello how are you ?\n' - 'Sherpa : I\'m fine, thank you. How are you ?\n' - 'User : I\'m fine too, thanks.\n' - 'Sherpa : That\'s good to hear\n' - 'User :', + 'Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User\'s requests immediately and with precision.\n\n' + 'User: Hello, Bob.\n' + 'Bob: Hello. How may I help you today?\n' + 'User: Please tell me the largest city in Europe.\n' + 'Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.\n' + 'User:', + 'Sherpa: Hello, I\'m Sherpa, your personal assistant. I can write, complex mails, code and even songs\n' + 'User: Hello how are you ?\n' + 'Sherpa: I\'m fine, thank you. 
How are you ?\n' + 'User: I\'m fine too, thanks.\n' + 'Sherpa: That\'s good to hear\n' + 'User:', ]; bool inProgress = false; @@ -229,8 +229,8 @@ class _MyHomePageState extends State { ), printLnLog: printLnLog, printLog: printResult, - promptPassed: prePrompt + - promptController.text.trim() + + promptPassed: prePrompt, + firstInteraction: promptController.text.trim() + (promptController.text.isEmpty ? "" : "\n"), done: done, canStop: canUseStop, @@ -538,10 +538,11 @@ class _MyHomePageState extends State { void initDefaultPrompts() async { var prefs = await SharedPreferences.getInstance(); - if (!prefs.containsKey("prePrompts")) { + var prePrompts = await getPrePrompts(); + if (prePrompts.isEmpty) { await prefs.setStringList("prePrompts", defaultPrePrompts); + prePrompts = defaultPrePrompts; } - var prePrompts = await getPrePrompts(); var defaultPrePrompt = prefs.getString("defaultPrePrompt"); if (defaultPrePrompt != null) { prePrompt = defaultPrePrompt; @@ -552,7 +553,7 @@ class _MyHomePageState extends State { if (prefs.containsKey("reversePrompt")) { reversePromptController.text = prefs.getString("reversePrompt") ?? 
""; } else { - reversePromptController.text = 'User :'; + reversePromptController.text = 'User:'; } reversePromptController.addListener(() { prefs.setString("reversePrompt", reversePromptController.text); @@ -703,8 +704,8 @@ class _MyHomePageState extends State { alignment: WrapAlignment.center, children: [ const Text( - 'Please download the 7B ggml-model-q4 from the official link meta provided you.\n' - 'Then open it.\n', + 'Please download a compatible GGML model.\n' + 'Then open it here.\n', textAlign: TextAlign.center, style: TextStyle(color: Colors.red), ), diff --git a/linux/CMakeLists.txt b/linux/CMakeLists.txt index b9a9458f..c85877c0 100644 --- a/linux/CMakeLists.txt +++ b/linux/CMakeLists.txt @@ -136,3 +136,8 @@ if(NOT CMAKE_BUILD_TYPE MATCHES "Debug") install(FILES "${AOT_LIBRARY}" DESTINATION "${INSTALL_BUNDLE_LIB_DIR}" COMPONENT Runtime) endif() + +set(BUILD_SHARED_LIBS ON) +add_subdirectory(./llama.cpp) +add_subdirectory(./llamasherpa) +install(FILES ${PROJECT_BINARY_DIR}/llamasherpa/libllamasherpa.so DESTINATION "${INSTALL_BUNDLE_LIB_DIR}" COMPONENT Runtime) diff --git a/linux/llama.cpp b/linux/llama.cpp new file mode 120000 index 00000000..7aaba50f --- /dev/null +++ b/linux/llama.cpp @@ -0,0 +1 @@ +../src/llama.cpp \ No newline at end of file diff --git a/linux/llamasherpa b/linux/llamasherpa new file mode 120000 index 00000000..6b183b44 --- /dev/null +++ b/linux/llamasherpa @@ -0,0 +1 @@ +../src/llamasherpa \ No newline at end of file diff --git a/pubspec.yaml b/pubspec.yaml index 5ca334f4..95e534db 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -60,12 +60,12 @@ dev_dependencies: # For information on the generic Dart part of this file, see the # following page: https://dart.dev/tools/pub/pubspec ffigen: - output: lib/generated_bindings_llama.dart + output: lib/generated_bindings_llamasherpa.dart headers: entry-points: # - "src/llama.cpp/utils.h" # - "src/llama.cpp/ggml.cpp" - - "src/llama.cpp/llama.h" + - "src/llamasherpa/llamasherpa.h" # The 
following section is specific to Flutter packages. flutter: @@ -80,7 +80,6 @@ flutter: # - images/a_dot_ham.jpeg assets: - assets/ - - assets/libs/ # An image asset can refer to one or more resolution-specific "variants", see # https://flutter.dev/assets-and-images/#resolution-aware @@ -129,4 +128,4 @@ flutter_icons: generate: true image_path: "assets/sherpa.png" - # flutter pub run msix:create \ No newline at end of file + # flutter pub run msix:create diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 00000000..17ab2b34 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,4 @@ +cmake_minimum_required(VERSION 3.10) +set(BUILD_SHARED_LIBS ON) +add_subdirectory("./llama.cpp") +add_subdirectory(./llamasherpa) diff --git a/src/llama.cpp b/src/llama.cpp index 248efc63..dadbed99 160000 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1 +1 @@ -Subproject commit 248efc6354654f20fe30f2f8e4d7b9ff1bc6840f +Subproject commit dadbed99e65252d79f81101a392d0d6497b86caa diff --git a/src/llamasherpa/CMakeLists.txt b/src/llamasherpa/CMakeLists.txt new file mode 100644 index 00000000..a4f4008f --- /dev/null +++ b/src/llamasherpa/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required(VERSION 3.9) +project(llamasherpa VERSION 1.0.0 DESCRIPTION "llamasherpa") +add_library(llamasherpa SHARED llamasherpa.cpp) +target_include_directories(llamasherpa PRIVATE ../llama.cpp) +target_link_libraries(llamasherpa llama) diff --git a/src/llamasherpa/llamasherpa.cpp b/src/llamasherpa/llamasherpa.cpp new file mode 100644 index 00000000..a51841c3 --- /dev/null +++ b/src/llamasherpa/llamasherpa.cpp @@ -0,0 +1,410 @@ +// Defines sigaction on msys: +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include "llama.h" +#include "llamasherpa.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static bool 
is_interacting = false; +static const int initial_n_ctx = 512; +static int n_ctx; +static const int n_batch = 1024; +static const int n_predict = 256; +static int n_keep = 48; +static const float repeat_penalty = 1.0; +static const int32_t repeat_last_n = 64; +static const float presence_penalty = 0.00f; +static const float frequency_penalty = 0.00f; +static const int32_t top_k = 40; +static const float top_p = 0.95f; +static const float typical_p = 1.00f; +static const float temp = 0.80f; +static const float tfs_z = 1.00f; +static int mirostat = 0; +static const float mirostat_tau = 5.00f; +static const float mirostat_eta = 0.10f; +static const bool penalize_nl = true; + +int32_t get_num_physical_cores() { +#ifdef __linux__ + // enumerate the set of thread siblings, num entries is num cores + std::unordered_set siblings; + for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { + std::ifstream thread_siblings("/sys/devices/system/cpu" + + std::to_string(cpu) + "/topology/thread_siblings"); + if (!thread_siblings.is_open()) { + break; // no more cpus + } + std::string line; + if (std::getline(thread_siblings, line)) { + siblings.insert(line); + } + } + if (siblings.size() > 0) { + return static_cast(siblings.size()); + } +#elif defined(__APPLE__) && defined(__MACH__) + int32_t num_physical_cores; + size_t len = sizeof(num_physical_cores); + int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0); + if (result == 0) { + return num_physical_cores; + } + result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0); + if (result == 0) { + return num_physical_cores; + } +#elif defined(_WIN32) + //TODO: Implement +#endif + unsigned int n_threads = std::thread::hardware_concurrency(); + return n_threads > 0 ? (n_threads <= 4 ? 
n_threads : n_threads / 2) : 4; +} +static int32_t n_threads = get_num_physical_cores(); + +// TODO: not great allocating this every time +std::vector llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) { + // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars + std::vector res(text.size() + (int) add_bos); + const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); + assert(n >= 0); + res.resize(n); + + return res; +} + +static llama_model *model; +static llama_context *ctx; +static std::vector llama_token_newline; +static std::string antiprompt; + +// TODO: replace with ring-buffer +static std::vector last_n_tokens; +static bool is_antiprompt = false; + +static int n_past = 0; +static int n_remain = n_predict; +static int n_consumed = 0; + +static std::vector embd; +static std::vector embd_inp; + +int llamasherpa_start(const char *model_path, const char *_prompt, const char *_antiprompt, show_output_cb *show_output) { + + llama_backend_init(false); + antiprompt = _antiprompt; + + // load the model and apply lora adapter, if any + auto lparams = llama_context_default_params(); + + lparams.n_ctx = initial_n_ctx; + lparams.n_batch = n_batch; + lparams.seed = time(0); + + model = llama_load_model_from_file(model_path, lparams); + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, model_path); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__,model_path); + llama_free_model(model); + return 1; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + std::string prompt(_prompt); + if (prompt.back() == '\n') { + prompt.pop_back(); + } + + // Add a space in front of the first 
character to match OG llama tokenizer behavior + prompt.insert(0, 1, ' '); + + // tokenize the prompt + embd_inp = ::llama_tokenize(ctx, prompt, true); + + n_ctx = llama_n_ctx(ctx); + + if ((int) embd_inp.size() > n_ctx - 4) { + fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + return 1; + } + + // number of tokens to keep when resetting context + if (n_keep > (int) embd_inp.size()) { + n_keep = (int)embd_inp.size(); + } + + // determine newline token + llama_token_newline = ::llama_tokenize(ctx, "\n", false); + + fprintf(stderr, "%s: interactive mode on.\n", __func__); + + if (antiprompt.length() > 0) { + fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); + } + + fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", + repeat_last_n, repeat_penalty, presence_penalty, frequency_penalty, top_k, tfs_z, top_p, typical_p, temp, mirostat, mirostat_eta, mirostat_tau); + fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, n_batch, n_predict, n_keep); + fprintf(stderr, "\n\n"); + + last_n_tokens = std::vector(n_ctx); + std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); + + // do one empty run to warm up the model + { + const std::vector tmp = { llama_token_bos(), }; + llama_eval(ctx, tmp.data(), tmp.size(), 0, n_threads); + llama_reset_timings(ctx); + } + + return 0; +} + +int llamasherpa_continue(const char *input, show_output_cb *show_output) { + std::string buffer(input); + + // Add tokens to embd only if the input buffer is non-empty + // Entering a empty line lets the user pass control back + if (buffer.length() > 1) { + auto line_inp = ::llama_tokenize(ctx, buffer, false); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + + n_remain -= 
line_inp.size(); + } + + if (n_past > 0) { + is_interacting = false; + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + if (n_remain <= 0 && n_predict != -1) { + n_remain = n_predict; + is_interacting = true; + } + + while (true) { + // predict + if (embd.size() > 0) { + // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + auto max_embd_size = n_ctx - 4; + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int)embd.size() > max_embd_size) { + auto skipped_tokens = embd.size() - max_embd_size; + printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + fflush(stdout); + embd.resize(max_embd_size); + } + + // infinite text generation via context swapping + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() > n_ctx) { + const int n_left = n_past - n_keep; + + // always keep the first token - BOS + n_past = std::max(1, n_keep); + + // insert n_left/2 tokens at the start of embd from last_n_tokens + embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); + + //printf("\n---\n"); + //printf("resetting: '"); + //for (int i = 0; i < (int) embd.size(); i++) { + // printf("%s", llama_token_to_str(ctx, embd[i])); + //} + //printf("'\n"); + //printf("\n---\n"); + } + + // evaluate tokens in batches + // embd is typically prepared beforehand to fit within a batch, but not always + for (int i = 0; i < (int) embd.size(); i += n_batch) { + int n_eval = (int) embd.size() - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + if (llama_eval(ctx, &embd[i], n_eval, n_past, n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + n_past 
+= n_eval; + } + } + + embd.clear(); + + if ((int) embd_inp.size() <= n_consumed && !is_interacting) { + // out of user input, sample next token + const float alpha_presence = presence_penalty; + const float alpha_frequency = frequency_penalty; + + llama_token id = 0; + + { + auto logits = llama_get_logits(ctx); + auto n_vocab = llama_n_vocab(ctx); + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // Apply penalties + float nl_logit = logits[llama_token_nl()]; + auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); + llama_sample_repetition_penalty(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, + last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + if (!penalize_nl) { + logits[llama_token_nl()] = nl_logit; + } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &candidates_p); + } else { + if (mirostat == 1) { + static float mirostat_mu = 2.0f * mirostat_tau; + const int mirostat_m = 100; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); + } else if (mirostat == 2) { + static float mirostat_mu = 2.0f * mirostat_tau; + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); + } else { + // Temperature sampling + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); + llama_sample_typical(ctx, 
&candidates_p, typical_p, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temperature(ctx, &candidates_p, temp); + id = llama_sample_token(ctx, &candidates_p); + } + } + // printf("`%d`", candidates_p.size); + + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(id); + } + + // replace end of text token with newline token when in interactive mode + if (id == llama_token_eos()) { + id = llama_token_newline.front(); + if (antiprompt.length() > 0) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, antiprompt, false); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + } + } + + // add it to the context + embd.push_back(id); + + // decrement remaining sampling budget + --n_remain; + } else { + // some user input remains from prompt or interaction, forward it to processing + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + last_n_tokens.erase(last_n_tokens.begin()); + last_n_tokens.push_back(embd_inp[n_consumed]); + ++n_consumed; + if ((int) embd.size() >= n_batch) { + break; + } + } + } + + // display text + for (auto id : embd) { + show_output(llama_token_to_str(ctx, id)); + } + + // if not currently processing queued inputs; + if ((int) embd_inp.size() <= n_consumed) { + + // check for reverse prompt + if (antiprompt.length() > 0) { + std::string last_output; + for (auto id : last_n_tokens) { + last_output += llama_token_to_str(ctx, id); + } + + is_antiprompt = false; + // Check if each of the reverse prompts appears at the end of the output. + // If we're not running interactively, the reverse prompt might be tokenized with some following characters + // so we'll compensate for that by widening the search window a bit. + size_t extra_padding = 0; + size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) + ? 
last_output.length() - static_cast(antiprompt.length() + extra_padding) + : 0; + + if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) { + is_interacting = true; + is_antiprompt = true; + fflush(stdout); + } + } + + if (n_past > 0 && is_interacting) { + return 0; + } + + if (n_past > 0) { + is_interacting = false; + } + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + if (n_remain <= 0 && n_predict != -1) { + n_remain = n_predict; + is_interacting = true; + } + } + + return 0; +} + +void llamasherpa_exit(void) { + llama_print_timings(ctx); + llama_free(ctx); + llama_free_model(model); +} diff --git a/src/llamasherpa/llamasherpa.h b/src/llamasherpa/llamasherpa.h new file mode 100644 index 00000000..82375e95 --- /dev/null +++ b/src/llamasherpa/llamasherpa.h @@ -0,0 +1,18 @@ +#ifndef __LLAMASHERPA_H +#define __LLAMASHERPA_H + +#ifdef WIN32 + #define EXPORT __declspec(dllexport) +#else + #define EXPORT extern "C" __attribute__((visibility("default"))) __attribute__((used)) +#endif + +typedef void show_output_cb(const char *); + +EXPORT int llamasherpa_start(const char *model_path, const char *_prompt, const char *_antiprompt, show_output_cb *show_output); + +EXPORT int llamasherpa_continue(const char *input, show_output_cb *show_output); + +EXPORT void llamasherpa_exit(void); + +#endif diff --git a/windows/runner/CMakeLists.txt b/windows/runner/CMakeLists.txt index 394917c0..63c7a28c 100644 --- a/windows/runner/CMakeLists.txt +++ b/windows/runner/CMakeLists.txt @@ -38,3 +38,6 @@ target_include_directories(${BINARY_NAME} PRIVATE "${CMAKE_SOURCE_DIR}") # Run the Flutter tool portions of the build. This must not be removed. add_dependencies(${BINARY_NAME} flutter_assemble) + +set(BUILD_SHARED_LIBS ON) +add_subdirectory(../../src/llama.cpp)