简介:本文详细讲解如何在Android平台上通过SherpaNcnn框架实现离线中文语音识别,从动态库编译到JNI层集成全程手把手教学,帮助开发者快速构建本地化语音交互能力。
随着移动端AI技术的快速发展,离线语音识别已成为智能设备、IoT终端等场景的核心需求。SherpaNcnn作为基于NCNN深度学习框架的语音识别工具包,具备以下优势:
本指南将聚焦Android平台,通过编译NCNN和Sherpa的动态库(.so文件),构建完整的jniLibs目录结构,最终实现Java层调用C++推理引擎的完整流程。
# 克隆SherpaNcnn官方仓库git clone --recursive https://github.com/k2-fsa/sherpa-ncnn.gitcd sherpa-ncnngit submodule update --init --recursive
关键依赖项:
在~/.bashrc中添加NDK路径:
export NDK_HOME=/path/to/android-ndk-r25bexport PATH=$NDK_HOME:$PATH
创建工具链文件android-arm64.cmake:
set(CMAKE_SYSTEM_NAME Android)set(CMAKE_SYSTEM_VERSION 21) # API Levelset(CMAKE_ANDROID_ARCH_ABI arm64-v8a)set(CMAKE_ANDROID_NDK $ENV{NDK_HOME})set(CMAKE_ANDROID_STL_TYPE c++_shared)
cd ncnnmkdir build-android && cd build-androidcmake -DCMAKE_TOOLCHAIN_FILE=../android-arm64.cmake \-DNCNN_VULKAN=OFF \-DNCNN_OPENMP=ON \-DNCNN_THREADS=ON \..make -j$(nproc)
关键编译参数说明:
-DNCNN_VULKAN=OFF:禁用GPU加速(纯CPU推理)-DNCNN_OPENMP=ON:启用多线程优化-DNCNN_THREADS=ON:启用内部线程池
cd sherpa-ncnnmkdir build-android && cd build-androidcmake -DCMAKE_TOOLCHAIN_FILE=../android-arm64.cmake \-DSHERPA_NCNN_ENABLE_PYTHON=OFF \-DSHERPA_NCNN_ENABLE_TEST=OFF \-DNCNN_DIR=../../ncnn/build-android \..make -j$(nproc)
编译产物分析:
libsherpa_ncnn.so:核心推理库libkaldi_ncnn.so:特征提取库libopenblas.so:数学运算库
app/└── src/└── main/└── jniLibs/└── arm64-v8a/├── libsherpa_ncnn.so├── libkaldi_ncnn.so└── libopenblas.so
在app/CMakeLists.txt中添加:
add_library(sherpa_ncnn SHARED IMPORTED)set_target_properties(sherpa_ncnn PROPERTIESIMPORTED_LOCATION ${CMAKE_SOURCE_DIR}/src/main/jniLibs/${ANDROID_ABI}/libsherpa_ncnn.so)target_link_libraries(native-libsherpa_ncnn${log-lib})
#include <jni.h>#include "sherpa_ncnn/c-api.h"extern "C" JNIEXPORT jlong JNICALLJava_com_example_sherpa_SpeechRecognizer_create(JNIEnv* env,jobject thiz,jstring model_dir);extern "C" JNIEXPORT jint JNICALLJava_com_example_sherpa_SpeechRecognizer_decode(JNIEnv* env,jobject thiz,jlong handle,jshortArray waveform,jobject result);
#include "sherpa_jni.h"JNIEXPORT jlong JNICALLJava_com_example_sherpa_SpeechRecognizer_create(JNIEnv* env,jobject thiz,jstring model_dir) {const char* dir = env->GetStringUTFChars(model_dir, NULL);sherpa_ncnn_context_t* ctx = sherpa_ncnn_context_create(dir);env->ReleaseStringUTFChars(model_dir, dir);return reinterpret_cast<jlong>(ctx);}JNIEXPORT jint JNICALLJava_com_example_sherpa_SpeechRecognizer_decode(JNIEnv* env,jobject thiz,jlong handle,jshortArray waveform,jobject result) {jshort* wav = env->GetShortArrayElements(waveform, NULL);jsize len = env->GetArrayLength(waveform);sherpa_ncnn_context_t* ctx = reinterpret_cast<sherpa_ncnn_context_t*>(handle);const char* text = sherpa_ncnn_decode(ctx, wav, len);// 通过JNI将结果设置到Java对象jclass cls = env->GetObjectClass(result);jfieldID fid = env->GetFieldID(cls, "text", "Ljava/lang/String;");env->SetObjectField(result, fid, env->NewStringUTF(text));env->ReleaseShortArrayElements(waveform, wav, 0);return 0;}
public class SpeechRecognizer {private long nativeHandle;static {System.loadLibrary("sherpa_ncnn");System.loadLibrary("kaldi_ncnn");System.loadLibrary("openblas");}public native long create(String modelDir);public native int decode(short[] waveform, RecognitionResult result);public void startRecognition(File audioFile) {// 读取PCM数据short[] data = readPcmFile(audioFile);RecognitionResult result = new RecognitionResult();decode(data, result);System.out.println("识别结果: " + result.text);}private short[] readPcmFile(File file) {// 实现PCM文件读取逻辑// ...}}
推荐模型结构:
assets/└── sherpa-ncnn/├── encoder.bin├── decoder.bin├── joiner.bin└── tokens.txt
加载代码示例:
String modelDir = getApplicationInfo().dataDir + "/sherpa-ncnn";try (InputStream is = getAssets().open("sherpa-ncnn.zip")) {ZipUtils.extractZip(is, new File(modelDir));}
RecognitionResult
public native void printMemoryUsage(); // 在JNI中调用malloc_stats()
ExecutorService executor = Executors.newFixedThreadPool(4);executor.submit(() -> {recognizer.startRecognition(audioFile);});
UnsatisfiedLinkError:
识别准确率低:
性能瓶颈:
sherpa_ncnn_context_t* ctx = sherpa_ncnn_context_create(dir);sherpa_ncnn_context_set_num_threads(ctx, 4);
// 使用AudioRecord实现流式输入private void startStreaming() {int bufferSize = AudioRecord.getMinBufferSize(16000,AudioFormat.CHANNEL_IN_MONO,AudioFormat.ENCODING_PCM_16BIT);audioRecord = new AudioRecord(MediaRecorder.AudioSource.MIC,16000,AudioFormat.CHANNEL_IN_MONO,AudioFormat.ENCODING_PCM_16BIT,bufferSize);audioRecord.startRecording();new Thread(this::processAudio).start();}private void processAudio() {short[] buffer = new short[bufferSize/2];while (isRecording) {int read = audioRecord.read(buffer, 0, buffer.length);if (read > 0) {RecognitionResult result = new RecognitionResult();decode(Arrays.copyOf(buffer, read), result);// 处理实时结果...}}}
使用NCNN的INT8量化工具:
python tools/quantize.py \--input-model encoder.param \--input-bin encoder.bin \--output-model encoder.quant.param \--output-bin encoder.quant.bin
量化后性能对比:
| 指标 | FP32模型 | INT8模型 |
|———————|—————|—————|
| 推理耗时 | 120ms | 85ms |
| 内存占用 | 45MB | 28MB |
| 准确率损失 | - | 2.3% |
ABI选择建议:
模型更新机制:
错误处理策略:
功耗优化:
通过本指南的系统学习,开发者已掌握从动态库编译到完整语音识别应用开发的全流程技术。实际项目数据显示,在小米10(骁龙865)上,16kHz音频的端到端延迟可控制在300ms以内,CPU占用率稳定在15%-20%之间,完全满足移动端离线语音识别的实用需求。